diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml index 08f36d4c9..13303e6cb 100644 --- a/.pylintrc-local.yml +++ b/.pylintrc-local.yml @@ -1,14 +1,6 @@ - arg: ignore val: - mappers - - gas_dynamics - - burgers.py - - diffusion.py - - dt_finding.py - - nd_calculus.py - - pml.py - - poisson.py - - second_order.py - arg: ignored-modules val: - sympy diff --git a/doc/conf.py b/doc/conf.py index 0cd5ba6d6..1ff3ca070 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -35,6 +35,7 @@ def get_version(): "https://documen.tician.de/arraycontext/": None, "https://documen.tician.de/meshmode/": None, "https://documen.tician.de/loopy/": None, + "https://mpi4py.readthedocs.io/en/stable": None, } # index-page demo uses pyopencl via plot_directive diff --git a/examples/advection/var-velocity.py b/examples/advection/var-velocity.py index de1b45354..fdf2bd9ed 100644 --- a/examples/advection/var-velocity.py +++ b/examples/advection/var-velocity.py @@ -31,6 +31,7 @@ from grudge.array_context import PyOpenCLArrayContext +from grudge.grudge_array_context import GrudgeArrayContext from meshmode.dof_array import flatten from meshmode.mesh import BTAG_ALL @@ -100,6 +101,7 @@ def main(ctx_factory, dim=2, order=4, use_quad=False, visualize=False, flux_type="upwind"): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) + #actx = GrudgeArrayContext(queue) actx = PyOpenCLArrayContext( queue, allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), diff --git a/examples/advection/weak.py b/examples/advection/weak.py index 3470fdd60..2c5a1333f 100644 --- a/examples/advection/weak.py +++ b/examples/advection/weak.py @@ -99,7 +99,7 @@ def __call__(self, evt, basename, overwrite=True): def main(ctx_factory, dim=2, order=4, visualize=False): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) - actx = PyOpenCLArrayContext( + actx = GrudgeArrayContext( queue, allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), force_device_scalars=True, diff --git a/examples/geometry.py 
b/examples/geometry.py index 442bbcfff..dfa39e048 100644 --- a/examples/geometry.py +++ b/examples/geometry.py @@ -32,12 +32,14 @@ from grudge.array_context import PyOpenCLArrayContext +from grudge.grudge_array_context import GrudgeArrayContext from grudge import DiscretizationCollection, shortcuts def main(write_output=True): cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) + #actx = GrudgeArrayContext(queue) actx = PyOpenCLArrayContext( queue, allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), diff --git a/examples/hello-grudge.py b/examples/hello-grudge.py index cfb724115..017430d86 100644 --- a/examples/hello-grudge.py +++ b/examples/hello-grudge.py @@ -14,7 +14,7 @@ import grudge.op as op from meshmode.mesh.generation import generate_box_mesh from meshmode.array_context import PyOpenCLArrayContext -from grudge.dof_desc import DTAG_BOUNDARY, FACE_RESTR_INTERIOR +from grudge.dof_desc import BoundaryDomainTag, FACE_RESTR_INTERIOR ctx = cl.create_some_context() @@ -51,8 +51,8 @@ def flux(dcoll, u_tpair): vol_discr = dcoll.discr_from_dd("vol") -left_bndry = DTAG_BOUNDARY("left") -right_bndry = DTAG_BOUNDARY("right") +left_bndry = BoundaryDomainTag("left") +right_bndry = BoundaryDomainTag("right") x_vol = actx.thaw(dcoll.nodes()) x_bndry = actx.thaw(dcoll.discr_from_dd(left_bndry).nodes()) diff --git a/examples/maxwell/cavities.py b/examples/maxwell/cavities.py index 3d581c18a..23870121e 100644 --- a/examples/maxwell/cavities.py +++ b/examples/maxwell/cavities.py @@ -44,6 +44,7 @@ def main(ctx_factory, dim=3, order=4, visualize=False): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) + actx = PyOpenCLArrayContext( queue, allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), diff --git a/examples/wave/hjson/diff_2_axis.hjson b/examples/wave/hjson/diff_2_axis.hjson new file mode 100644 index 000000000..7f600edf8 --- /dev/null +++ b/examples/wave/hjson/diff_2_axis.hjson @@ -0,0 +1,317 @@ +{ + 
72a3ce98-5d21-48bf-b402-6ee96bafd1b6: + { + FP64: + { + 10: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 176 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 16 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 10 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 10 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 10 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 6: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 320 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 6 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 2 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 6 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 15: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 15 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 15 + ] + { + outer_tag: ilp + 
inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + } + } +} \ No newline at end of file diff --git a/examples/wave/hjson/diff_3_axis.hjson b/examples/wave/hjson/diff_3_axis.hjson new file mode 100644 index 000000000..3069f2033 --- /dev/null +++ b/examples/wave/hjson/diff_3_axis.hjson @@ -0,0 +1,317 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: + { + FP64: + { + 20: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 80 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 16 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 20 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 20 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 2 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 10: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 128 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 16 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 10 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 5 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: 
l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 10 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 35: + [ + [ + tag_inames + [ + imatrix: ilp + ] + ] + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 35 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 7 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + } + } +} \ No newline at end of file diff --git a/examples/wave/hjson/elwise_linear.hjson b/examples/wave/hjson/elwise_linear.hjson new file mode 100644 index 000000000..b48537c2a --- /dev/null +++ b/examples/wave/hjson/elwise_linear.hjson @@ -0,0 +1,299 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: + { + FP64: + { + 20: + [ + [ + split_iname + [ + iel + 16 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 8 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 4 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 4 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 2 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 10: + [ + [ + 
split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 10 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 2 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 35: + [ + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 35 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 7 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + } + } +} \ No newline at end of file diff --git a/examples/wave/hjson/face_mass.hjson b/examples/wave/hjson/face_mass.hjson new file mode 100644 index 000000000..06a0186ce --- /dev/null +++ b/examples/wave/hjson/face_mass.hjson @@ -0,0 +1,299 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: + { + FP64: + { + 20: + [ + [ + split_iname + [ + iel + 128 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 16 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 20 + ] + { + outer_tag: g.1 
+ slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 20 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + f,j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + N1,N0,N2 + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 10: + [ + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 10 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 10 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + f,j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + N1,N0,N2 + ] + ] + [ + split_iname + [ + j + 6 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + 35: + [ + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 7 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 7 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + vec + f,j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + N1,N0,N2 + ] + ] + [ + split_iname + [ + j + 1 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + } + } +} \ No newline at end of file diff --git a/examples/wave/hjson/nodes.hjson 
b/examples/wave/hjson/nodes.hjson new file mode 100644 index 000000000..8c2e180c3 --- /dev/null +++ b/examples/wave/hjson/nodes.hjson @@ -0,0 +1,106 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: + { + FP64: + { + 35: [] + 15: + [ + [ + split_iname + [ + iel + 32 + ] + { + outer_tag: g.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + iel_inner + 32 + ] + { + outer_tag: ilp + inner_tag: l.0 + slabs: + [ + 0 + 1 + ] + } + ] + [ + split_iname + [ + idof + 15 + ] + { + outer_tag: g.1 + slabs: + [ + 0 + 0 + ] + } + ] + [ + split_iname + [ + idof_inner + 15 + ] + { + outer_tag: ilp + inner_tag: l.1 + slabs: + [ + 0 + 1 + ] + } + ] + [ + add_prefetch + [ + nodes + j,iel_inner_outer,iel_inner_inner + ] + { + temporary_name: vecf + default_tag: l.auto + } + ] + [ + tag_array_axes + [ + vecf + f,f + ] + ] + [ + split_iname + [ + j + 3 + ] + { + outer_tag: for + inner_tag: for + } + ] + [ + add_inames_for_unused_hw_axes + ] + ] + } + } +} \ No newline at end of file diff --git a/examples/wave/var-propagation-speed.py b/examples/wave/var-propagation-speed.py index 9929f6dbf..0e48a7405 100644 --- a/examples/wave/var-propagation-speed.py +++ b/examples/wave/var-propagation-speed.py @@ -35,7 +35,11 @@ from pytools.obj_array import flat_obj_array +<<<<<<< HEAD +from grudge.grudge_array_context import GrudgeArrayContext +======= import grudge.op as op +>>>>>>> upstream/main import logging logger = logging.getLogger(__name__) @@ -44,12 +48,18 @@ def main(ctx_factory, dim=2, order=4, visualize=False): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) +<<<<<<< HEAD + actx = GrudgeArrayContext(queue) + + dims = 3 +======= actx = PyOpenCLArrayContext( queue, allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), force_device_scalars=True, ) +>>>>>>> upstream/main from meshmode.mesh.generation import generate_regular_rect_mesh mesh = generate_regular_rect_mesh( a=(-0.5,)*dim, diff --git a/examples/wave/wave-min-mpi.py b/examples/wave/wave-min-mpi.py index 
6c56353bd..1afcef10e 100644 --- a/examples/wave/wave-min-mpi.py +++ b/examples/wave/wave-min-mpi.py @@ -26,6 +26,7 @@ import numpy as np import pyopencl as cl +from grudge.grudge_array_context import GrudgeArrayContext import pyopencl.tools as cl_tools from grudge.array_context import MPIPyOpenCLArrayContext @@ -43,10 +44,6 @@ logger = logging.getLogger(__name__) -class WaveTag: - pass - - def main(ctx_factory, dim=2, order=4, visualize=False): comm = MPI.COMM_WORLD num_parts = comm.Get_size() diff --git a/examples/wave/wave-op-mpi.py b/examples/wave/wave-op-mpi.py index 8c23336d0..c84ce2e5f 100644 --- a/examples/wave/wave-op-mpi.py +++ b/examples/wave/wave-op-mpi.py @@ -29,6 +29,9 @@ import pyopencl as cl import pyopencl.tools as cl_tools +from grudge.array_context import PyOpenCLArrayContext, PytatoPyOpenCLArrayContext +from grudge.grudge_array_context import (AutotuningArrayContext, + GrudgeArrayContext, ParameterFixingPyOpenCLArrayContext) from arraycontext import ( with_container_arithmetic, dataclass_array_container @@ -181,7 +184,22 @@ def bump(actx, dcoll, t=0): def main(ctx_factory, dim=2, order=3, visualize=False, lazy=False, use_quad=False, use_nonaffine_mesh=False): cl_ctx = ctx_factory() - queue = cl.CommandQueue(cl_ctx) + queue = cl.CommandQueue(cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + if lazy: + actx = PytatoPyOpenCLArrayContext(queue) + else: + #actx = ParameterFixingPyOpenCLArrayContext( + actx = AutotuningArrayContext( + #actx = GrudgeArrayContext( + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), + force_device_scalars=True, + ) + #actx = PyOpenCLArrayContext( + # queue, + # allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)), + # force_device_scalars=True, + #) comm = MPI.COMM_WORLD num_parts = comm.Get_size() @@ -198,7 +216,9 @@ def main(ctx_factory, dim=2, order=3, from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis mesh_dist = 
MPIMeshDistributor(comm) - nel_1d = 16 + order=2 + dim = 3 + nel_1d = 2**5 if mesh_dist.is_mananger_rank(): if use_nonaffine_mesh: @@ -271,6 +291,8 @@ def rhs(t, w): t = 0 t_final = 3 istep = 0 + end_step = 10 + while t < t_final: start = time.time() diff --git a/examples/wave/wave-op-mpi.py.old b/examples/wave/wave-op-mpi.py.old new file mode 100644 index 000000000..ad024ec8c --- /dev/null +++ b/examples/wave/wave-op-mpi.py.old @@ -0,0 +1,251 @@ +"""Minimal example of a grudge driver.""" + +__copyright__ = """ +Copyright (C) 2020 Andreas Kloeckner +Copyright (C) 2021 University of Illinois Board of Trustees +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
import numpy as np
import numpy.linalg as la  # noqa
import pyopencl as cl
import pyopencl.tools as cl_tools

from arraycontext import thaw
from grudge.array_context import PyOpenCLArrayContext

from grudge.grudge_array_context import GrudgeArrayContext
from pytools.obj_array import flat_obj_array

from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa

from grudge.discretization import DiscretizationCollection
from grudge.shortcuts import make_visualizer

import grudge.op as op

import logging
logger = logging.getLogger(__name__)

from mpi4py import MPI


# {{{ wave equation bits

def wave_flux(dcoll, c, w_tpair):
    """Return the numerical flux for *w_tpair*, projected to ``"all_faces"``.

    The first component of *w_tpair* is treated as the scalar field ``u`` and
    the remaining components as the vector field ``v``. The flux is the sum of
    an average-based term and a jump-based ("upwind") term, scaled by *c*.
    """
    u = w_tpair[0]
    v = w_tpair[1:]

    # Face normals on the trace pair's discretization, thawed into the array
    # context of the interior trace of u.
    normal = thaw(dcoll.normal(w_tpair.dd), u.int.array_context)

    flux_weak = flat_obj_array(
        np.dot(v.avg, normal),
        normal*u.avg,
        )

    # upwind
    v_jump = np.dot(normal, v.ext-v.int)
    flux_weak += flat_obj_array(
        0.5*(u.ext-u.int),
        0.5*normal*v_jump,
        )

    return op.project(dcoll, w_tpair.dd, "all_faces", c*flux_weak)


def wave_operator(dcoll, c, w):
    """Evaluate the right-hand side of the wave system for the state *w*.

    ``w[0]`` is the scalar field and ``w[1:]`` the vector field. The result is
    the inverse mass operator applied to the sum of the weak volume terms
    (``-c`` times weak divergence/gradient) and the face mass operator applied
    to the boundary flux on ``BTAG_ALL`` plus all interior trace-pair fluxes.
    """
    u = w[0]
    v = w[1:]

    # Boundary data on BTAG_ALL: the exterior state flips the sign of u and
    # keeps v unchanged.
    dir_u = op.project(dcoll, "vol", BTAG_ALL, u)
    dir_v = op.project(dcoll, "vol", BTAG_ALL, v)
    dir_bval = flat_obj_array(dir_u, dir_v)
    dir_bc = flat_obj_array(-dir_u, dir_v)

    return (
        op.inverse_mass(
            dcoll,
            flat_obj_array(
                -c*op.weak_local_div(dcoll, v),
                -c*op.weak_local_grad(dcoll, u)
                )
            + op.face_mass(
                dcoll,
                wave_flux(
                    dcoll, c=c,
                    w_tpair=op.bdry_trace_pair(dcoll,
                                               BTAG_ALL,
                                               interior=dir_bval,
                                               exterior=dir_bc)
                ) + sum(
                    wave_flux(dcoll, c=c, w_tpair=tpair)
                    for tpair in op.interior_trace_pairs(dcoll, w)
                )
            )
        )
    )

# }}}


def rk4_step(y, t, h, f):
    """Advance *y* from time *t* by one four-stage Runge-Kutta step of size *h*.

    *f* is called as ``f(t, y)`` and must return something that can be scaled
    by a number and added to *y*.
    """
    k1 = f(t, y)
    k2 = f(t+h/2, y + h/2*k1)
    k3 = f(t+h/2, y + h/2*k2)
    k4 = f(t+h, y + h*k3)
    return y + h/6*(k1 + 2*k2 + 2*k3 + k4)


def estimate_rk4_timestep(actx, dcoll, c):
    """Return the nodal minimum over ``"vol"`` of the characteristic length
    scales of *dcoll* divided by *c*.
    """
    from grudge.dt_utils import characteristic_lengthscales

    local_dts = characteristic_lengthscales(actx, dcoll) / c

    return op.nodal_min(dcoll, "vol", local_dts)


def bump(actx, dcoll, t=0):
    """Return ``cos(3*t) * exp(-|x - x0|**2 / 0.05**2)`` on the nodes of *dcoll*.

    ``x0`` is ``(0.2, 0.35, 0.1)`` truncated to ``dcoll.dim`` entries.
    """
    source_center = np.array([0.2, 0.35, 0.1])[:dcoll.dim]
    source_width = 0.05
    source_omega = 3

    nodes = thaw(dcoll.nodes(), actx)
    center_dist = flat_obj_array([
        nodes[i] - source_center[i]
        for i in range(dcoll.dim)
        ])

    return (
        np.cos(source_omega*t)
        * actx.np.exp(
            -np.dot(center_dist, center_dist)
            / source_width**2))


def main(ctx_factory, dim=2, order=3, visualize=False):
    """Run the MPI-distributed wave example until ``t_final = 3``.

    :arg ctx_factory: callable returning a :class:`pyopencl.Context`.
    :arg dim: dimension of the box ``[-0.5, 0.5]**dim`` that is meshed.
    :arg order: order passed to :class:`DiscretizationCollection`.
    :arg visualize: if true, write a parallel VTK file every tenth step.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    #actx = GrudgeArrayContext(queue)
    actx = PyOpenCLArrayContext(
        queue,
        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
        force_device_scalars=True,
    )

    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
    mesh_dist = MPIMeshDistributor(comm)

    nel_1d = 16

    # One rank generates and partitions the mesh and sends the parts; every
    # other rank receives its part.
    # NOTE(review): "is_mananger_rank" is spelled as in the original; confirm
    # it matches the method name of the installed meshmode.
    if mesh_dist.is_mananger_rank():
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(
            a=(-0.5,)*dim,
            b=(0.5,)*dim,
            nelements_per_axis=(nel_1d,)*dim)

        logger.info("%d elements", mesh.nelements)

        part_per_element = get_partition_by_pymetis(mesh, num_parts)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

        del mesh

    else:
        local_mesh = mesh_dist.receive_mesh_part()

    dcoll = DiscretizationCollection(actx, local_mesh, order=order,
                                     mpi_communicator=comm)

    # Initial state: a bump in the scalar field, zero in every vector component.
    fields = flat_obj_array(
        bump(actx, dcoll),
        [dcoll.zeros(actx) for i in range(dcoll.dim)]
    )

    c = 1
    dt = 0.45 * estimate_rk4_timestep(actx, dcoll, c)

    vis = make_visualizer(dcoll)

    def rhs(t, w):
        return wave_operator(dcoll, c=c, w=w)

    if comm.rank == 0:
        logger.info("dt = %g", dt)

    t = 0
    t_final = 3
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        l2norm = op.norm(dcoll, fields[0], 2)

        if istep % 10 == 0:
            linfnorm = op.norm(dcoll, fields[0], np.inf)
            nodalmax = op.nodal_max(dcoll, "vol", fields[0])
            nodalmin = op.nodal_min(dcoll, "vol", fields[0])
            if comm.rank == 0:
                logger.info(f"step: {istep} t: {t} "
                            f"L2: {l2norm} "
                            f"Linf: {linfnorm} "
                            f"sol max: {nodalmax} "
                            f"sol min: {nodalmin}")
            if visualize:
                # "{{rank:03d}}" is left as a literal "{rank:03d}" placeholder
                # in the file name handed to the visualizer.
                vis.write_parallel_vtk_file(
                    comm,
                    f"fld-wave-eager-mpi-{{rank:03d}}-{istep:04d}.vtu",
                    [
                        ("u", fields[0]),
                        ("v", fields[1:]),
                    ]
                )

        t += dt
        istep += 1

        # NOTE: These are here to ensure the solution is bounded for the
        # time interval specified
        assert l2norm < 1


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dim", default=2, type=int)
    parser.add_argument("--order", default=3, type=int)
    parser.add_argument("--visualize", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    main(cl.create_some_context,
         dim=args.dim,
         order=args.order,
         visualize=args.visualize)

# vim: foldmethod=marker
__copyright__ = "Copyright (C) 2020 Andreas Kloeckner"

__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""


import numpy as np
import numpy.linalg as la  # noqa
import pyopencl as cl

from pytools.obj_array import flat_obj_array

# NOTE(review): examples/wave/wave-op-mpi.py imports "AutotuningArrayContext"
# (lower-case "t") from this same module; one of the two spellings is wrong --
# confirm against grudge/grudge_array_context.py.
from grudge.grudge_array_context import GrudgeArrayContext, AutoTuningArrayContext
from meshmode.array_context import PyOpenCLArrayContext  # noqa F401
from meshmode.dof_array import thaw

from meshmode.mesh import BTAG_ALL, BTAG_NONE  # noqa

from grudge.discretization import DiscretizationCollection
import grudge.op as op
from grudge.shortcuts import make_visualizer
from grudge.symbolic.primitives import TracePair
from time import time


# {{{ wave equation bits

def wave_flux(dcoll, c, w_tpair):
    """Return the numerical flux for *w_tpair*, projected to ``"all_faces"``.

    The first component of *w_tpair* is treated as the scalar field ``u`` and
    the remaining components as the vector field ``v``. The flux is the sum of
    an average-based term and a jump-based ("upwind") term, scaled by *c*.
    """
    u = w_tpair[0]
    v = w_tpair[1:]

    normal = thaw(u.int.array_context, op.normal(dcoll, w_tpair.dd))

    flux_weak = flat_obj_array(
        np.dot(v.avg, normal),
        normal*u.avg,
        )

    # upwind
    flux_weak += flat_obj_array(
        0.5*(u.ext-u.int),
        0.5*normal*np.dot(normal, v.ext-v.int),
        )

    return op.project(dcoll, w_tpair.dd, "all_faces", c*flux_weak)


def _report_scoped_subexpr_sizes(discr):
    """Print the shapes and total size (in GB) of the arrays *discr* caches.

    Reads the private ``_discr_scoped_subexpr_name_to_value`` table of *discr*.
    Each cached value's ``_data`` is either one array or a tuple of arrays.
    """
    scoped = discr._discr_scoped_subexpr_name_to_value
    print(len(scoped))
    print(scoped.keys())

    total_bytes = 0
    for value in scoped.values():
        data = value._data
        arrays = data if isinstance(data, tuple) else (data,)
        for ary in arrays:
            print(ary.shape)
            # Assumes 2D arrays with 8-byte entries -- TODO confirm.
            total_bytes += ary.shape[0]*ary.shape[1]*8
    print(total_bytes / 1e9)


def wave_operator(discr, c, w):
    """Evaluate the right-hand side of the wave system for the state *w*.

    ``w[0]`` is the scalar field and ``w[1:]`` the vector field. The result is
    the inverse mass operator applied to the sum of the weak volume terms
    (``-c`` times weak divergence/gradient) and the face mass operator applied
    to the interior flux plus the boundary flux on ``BTAG_ALL``.

    The evaluation is split into named steps and every intermediate is
    ``del``-ed once it has been consumed, so that at most a few temporaries
    are referenced at a time. After each evaluation the sizes of the arrays
    cached on *discr* are printed.

    If the OpenCL runtime reports an out-of-memory condition, the contents of
    ``pyopencl.array.Array.alloc_dict`` are printed and the process exits
    with status 1.
    """
    # NOTE: this deliberately shadows the builtin MemoryError inside this
    # function; only the OpenCL out-of-memory error is handled below.
    from pyopencl import MemoryError
    from pyopencl.array import Array
    from time import sleep

    try:
        u = w[0]
        v = w[1:]

        # Boundary data: the exterior state flips the sign of u, keeps v.
        dir_u = op.project(discr, "vol", BTAG_ALL, u)
        dir_v = op.project(discr, "vol", BTAG_ALL, v)
        dir_bval = flat_obj_array(dir_u, dir_v)
        neg_dir_u = -dir_u
        del dir_u
        dir_bc = flat_obj_array(neg_dir_u, dir_v)

        # Volume terms.
        div = op.weak_local_div(discr, v)
        neg_c_div = (-c)*div
        del div

        grad = op.weak_local_grad(discr, u)
        neg_c_grad = (-c)*grad
        del grad

        obj_array = flat_obj_array(neg_c_div, neg_c_grad)

        # Interior and boundary fluxes.
        trace_pair1 = op.interior_trace_pair(discr, w)
        wave_flux1 = wave_flux(discr, c=c, w_tpair=trace_pair1)
        del trace_pair1

        trace_pair2 = TracePair(BTAG_ALL, interior=dir_bval, exterior=dir_bc)
        wave_flux2 = wave_flux(discr, c=c, w_tpair=trace_pair2)
        del trace_pair2
        del dir_bc
        del neg_dir_u
        del dir_v
        del dir_bval

        wave_flux_sum = wave_flux1 + wave_flux2
        del wave_flux1
        del wave_flux2

        face_mass = op.face_mass(discr, wave_flux_sum)
        del wave_flux_sum

        inverse_arg = obj_array + face_mass
        del obj_array
        del face_mass
        del neg_c_div
        del neg_c_grad

        result = op.inverse_mass(discr, inverse_arg)
        del inverse_arg

        # NOTE(review): this pause and the report below run on every
        # right-hand-side evaluation, i.e. inside the region timed by main().
        # They look like debugging leftovers -- confirm before trusting the
        # reported step times.
        sleep(3)
        _report_scoped_subexpr_sizes(discr)

    except MemoryError:
        # NOTE(review): ``Array.alloc_dict`` is not something this file
        # defines; presumably it comes from an instrumented pyopencl build
        # and maps a key to (entries, nbytes) -- confirm.
        for key, value in Array.alloc_dict.items():
            print("{} {}".format(key, value[1]/1e9))
            for entry in value[0]:
                print(entry)
            print()
        # Running out of device memory is a failure: exit non-zero, and do
        # not rely on the site-provided exit() helper.
        raise SystemExit(1)

    return result

# }}}
def rk4_step(y, t, h, f):
    """Advance *y* by one classical fourth-order Runge-Kutta step.

    Intermediate results are released with ``del`` as soon as they are no
    longer needed, so that only a few state-sized objects are alive at any
    one time.

    :arg y: state at time *t*. Only needs to support ``+`` with another
        state and ``*`` with a scalar.
    :arg t: current time.
    :arg h: time step size.
    :arg f: right-hand side, called as ``f(t, y)``.
    :returns: the state at time ``t + h``.
    """
    k1 = f(t, y)
    k_sum = k1
    y_stage = y + (h/2)*k1
    del k1

    # The second stage is evaluated at y + h/2*k1. y_stage already contains
    # y, so y must not be added a second time here.
    k2 = f(t + h/2, y_stage)
    del y_stage
    k_sum = k_sum + 2*k2
    y_stage = y + (h/2)*k2
    del k2

    k3 = f(t + h/2, y_stage)
    del y_stage
    k_sum = k_sum + 2*k3
    y_stage = y + h*k3
    del k3

    k4 = f(t + h, y_stage)
    del y_stage
    k_sum = k_sum + k4
    del k4

    # y + h/6*(k1 + 2*k2 + 2*k3 + k4); drop the stage sum before the final
    # addition allocates the result.
    increment = (h/6)*k_sum
    del k_sum
    return y + increment


def bump(actx, dcoll, t=0):
    """Return a Gaussian bump on the volume nodes of *dcoll*.

    The bump is ``cos(omega*t) * exp(-|x - center|**2 / width**2)`` with a
    fixed center (truncated to ``dcoll.dim`` components), width and
    frequency.

    :arg actx: array context used to thaw the nodes and evaluate ``exp``.
    :arg dcoll: discretization collection providing ``dim`` and ``nodes()``.
    :arg t: time at which the cosine modulation is evaluated.
    """
    source_center = np.array([0.2, 0.35, 0.1])[:dcoll.dim]
    source_width = 0.05
    source_omega = 3

    nodes = thaw(actx, dcoll.nodes())
    center_dist = flat_obj_array([
        nodes[i] - source_center[i]
        for i in range(dcoll.dim)
        ])

    return (
        np.cos(source_omega*t)
        * actx.np.exp(
            -np.dot(center_dist, center_dist)
            / source_width**2))


def main():
    """Time the eager wave operator on a regular box mesh.

    Picks the number of elements per axis whose total point count is closest
    to ``target_num_points``, runs 21 RK4 steps of the wave operator and
    prints the average wall time per step per point.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(
        cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    from pyopencl.tools import ImmediateAllocator
    actx = AutoTuningArrayContext(queue, allocator=ImmediateAllocator(queue))
    from meshmode.mesh.generation import generate_regular_rect_mesh

    dim = 3
    order = 2

    target_num_points = 6000000  # Order fails assertion with more than this
    # Points per element keyed by polynomial order. The values are the 3D
    # simplex counts (p+1)(p+2)(p+3)/6.
    order_points_mapping = {2: 10, 3: 20, 4: 35, 5: 56, 6: 84, 7: 120}

    # Refine one element per axis at a time until the target is reached,
    # remembering the previous mesh so that the closer one can be chosen.
    cur_points = 0
    cur_points_old = 0
    nel_1d = 0
    mesh_old = None
    mesh = None
    while cur_points < target_num_points:
        print(cur_points)
        nel_1d += 1
        mesh_old = mesh
        mesh = generate_regular_rect_mesh(
            coord_dtype=np.float64,
            a=(-0.5,)*dim,
            b=(0.5,)*dim,
            nelements_per_axis=(nel_1d,)*dim)
        cur_points_old = cur_points
        cur_points = order_points_mapping[order]*mesh.nelements

    # Pick whichever is closer to the target. mesh_old is None when the very
    # first mesh already overshoots; there is nothing to fall back to then.
    if (mesh_old is not None
            and (target_num_points - cur_points_old)
            < (cur_points - target_num_points)):
        mesh = mesh_old
        nel_1d -= 1

    print(mesh.nelements)

    if dim == 2:
        # no deep meaning here, just a fudge factor
        dt = 0.7/(nel_1d*order**2)
    elif dim == 3:
        # no deep meaning here, just a fudge factor
        dt = 0.45/(nel_1d*order**2)
    else:
        raise ValueError("don't have a stable time step guesstimate")

    dcoll = DiscretizationCollection(actx, mesh, order=order)

    fields = flat_obj_array(
        bump(actx, dcoll),
        [dcoll.zeros(actx) for _ in range(dcoll.dim)]
        )

    vis = make_visualizer(dcoll)

    for field in fields:
        print(field[0][0].shape)

    def rhs(t, w):
        return wave_operator(dcoll, c=1, w=w)

    t = 0
    t_final = 21*dt
    istep = 0
    start = time()
    nsteps = 0

    nelements, ndofs = fields[0][0].shape
    npts = nelements*ndofs
    print(npts)

    while t < t_final:

        print(f"===========TIME STEP {istep}===========")
        fields = rk4_step(fields, t, dt, rhs)

        if istep % 100 == 0:
            print(f"step: {istep} t: {t} L2: {op.norm(dcoll, fields[0], 2)} "
                    f"sol max: {op.nodal_max(dcoll, 'vol', fields[0])}")
            vis.write_vtk_file("fld-wave-eager-%04d.vtu" % istep,
                    [
                        ("u", fields[0]),
                        ("v", fields[1:]),
                        ])

        print(f"===========END TIME STEP {istep}===========")
        istep += 1
        t = istep*dt
        nsteps += 1

    # Should compare against base version at some point
    end = time()
    diff = end - start
    nelements, ndofs = fields[0][0].shape
    npts = nelements*ndofs
    time_per_timestep_per_point = diff / nsteps / npts
    print(f"AVERAGE STEP TIME PER POINT: {time_per_timestep_per_point}")


if __name__ == "__main__":
    main()

# vim: foldmethod=marker
""" -from pytools import memoize_method +from typing import Sequence, Mapping, Optional, Union, Tuple, TYPE_CHECKING, Any + +from pytools import memoize_method, single_valued + +from dataclasses import dataclass, replace from grudge.dof_desc import ( - DD_VOLUME, - DISCR_TAG_BASE, - DISCR_TAG_MODAL, - DTAG_BOUNDARY, - DOFDesc, - as_dofdesc + VTAG_ALL, + DD_VOLUME_ALL, + DISCR_TAG_BASE, + DISCR_TAG_MODAL, + VolumeDomainTag, BoundaryDomainTag, + DOFDesc, + VolumeTag, DomainTag, + DiscretizationTag, + as_dofdesc, + ConvertibleToDOFDesc ) import numpy as np # noqa: F401 from arraycontext import ArrayContext +from meshmode.discretization import ElementGroupFactory, Discretization from meshmode.discretization.connection import ( FACE_RESTR_INTERIOR, FACE_RESTR_ALL, - make_face_restriction + make_face_restriction, + DiscretizationConnection ) from meshmode.mesh import Mesh, BTAG_PARTITION +from meshmode.dof_array import DOFArray from warnings import warn +if TYPE_CHECKING: + import mpi4py.MPI + + +@dataclass(frozen=True) +class PartID: + """Unique identifier for a piece of a partitioned mesh. + + .. attribute:: volume_tag + + The volume of the part. + + .. attribute:: rank + + The (optional) MPI rank of the part. 
# {{{ part ID normalization

def _normalize_mesh_part_ids(
        mesh: Mesh,
        volume_tags: Sequence[VolumeTag],
        mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None):
    """Return a copy of *mesh* whose inter-part adjacency groups use
    :class:`PartID` for both ``part_id`` and the ``BTAG_PARTITION`` boundary
    tag.

    Which raw part IDs are accepted depends on the configuration:

    * multi-volume with MPI: :class:`PartID` only,
    * multi-volume without MPI: :class:`PartID` or one of *volume_tags*,
    * single-volume (``VTAG_ALL`` in *volume_tags*) with MPI:
      :class:`PartID` or an integer rank,
    * single-volume without MPI: nothing.

    :raises TypeError: if a part ID cannot be converted.
    """
    from numbers import Integral

    multi_volume = VTAG_ALL not in volume_tags
    distributed = mpi_communicator is not None

    def as_part_id(mesh_part_id):
        # A ready-made PartID passes through in every configuration except
        # single-volume/non-distributed, where no part IDs are expected.
        if (multi_volume or distributed) and isinstance(mesh_part_id, PartID):
            return mesh_part_id

        if multi_volume and not distributed and mesh_part_id in volume_tags:
            return PartID(mesh_part_id)

        if (not multi_volume and distributed
                and isinstance(mesh_part_id, Integral)):
            return PartID(VTAG_ALL, int(mesh_part_id))

        raise TypeError(f"Unable to convert {mesh_part_id} to PartID.")

    from meshmode.mesh import InterPartAdjacencyGroup

    def normalize_group(fagrp):
        # Only inter-part groups carry a part ID; everything else is kept.
        if not isinstance(fagrp, InterPartAdjacencyGroup):
            return fagrp

        part_id = as_part_id(fagrp.part_id)
        return replace(
            fagrp,
            boundary_tag=BTAG_PARTITION(part_id),
            part_id=part_id)

    normalized_groups = [
        [normalize_group(fagrp) for fagrp in grp_list]
        for grp_list in mesh.facial_adjacency_groups]

    return mesh.copy(facial_adjacency_groups=normalized_groups)

# }}}
# {{{ discr_tag_to_group_factory normalization

def _normalize_discr_tag_to_group_factory(
        dim: int,
        discr_tag_to_group_factory: Optional[
            Mapping[DiscretizationTag, ElementGroupFactory]],
        order: Optional[int]
        ) -> Mapping[DiscretizationTag, ElementGroupFactory]:
    """Return a new, complete discretization-tag-to-group-factory mapping.

    The input mapping is copied, never modified. If *order* is given, a
    default simplex group factory of that order is registered under
    ``DISCR_TAG_BASE``. A ``DISCR_TAG_MODAL`` entry derived from the base
    factory is added unless one is already present.

    :raises TypeError: if both *order* and *discr_tag_to_group_factory*
        are *None*.
    :raises ValueError: if *order* is given and the mapping already has a
        ``DISCR_TAG_BASE`` entry.
    """
    from meshmode.discretization.poly_element import \
            default_simplex_group_factory

    if discr_tag_to_group_factory is None and order is None:
        raise TypeError(
            "one of 'order' and 'discr_tag_to_group_factory' must be given"
        )

    if discr_tag_to_group_factory is None:
        factories = {}
    else:
        factories = dict(discr_tag_to_group_factory)

    if order is not None:
        if DISCR_TAG_BASE in factories:
            raise ValueError(
                "if 'order' is given, 'discr_tag_to_group_factory' must "
                "not have a key of DISCR_TAG_BASE"
            )

        factories[DISCR_TAG_BASE] = default_simplex_group_factory(
            base_dim=dim, order=order)

    # Modal discr should always come from the base discretization
    if DISCR_TAG_MODAL not in factories:
        base_factory = factories[DISCR_TAG_BASE]
        factories[DISCR_TAG_MODAL] = \
            _generate_modal_group_factory(base_factory)

    return factories

# }}}
autoattribute:: complex_dtype @@ -84,11 +236,16 @@ class DiscretizationCollection: # {{{ constructor - def __init__(self, array_context: ArrayContext, mesh: Mesh, - order=None, - discr_tag_to_group_factory=None, mpi_communicator=None, - # FIXME: `quad_tag_to_group_factory` is deprecated - quad_tag_to_group_factory=None): + def __init__(self, array_context: ArrayContext, + volume_discrs: Union[Mesh, Mapping[VolumeTag, Discretization]], + order: Optional[int] = None, + discr_tag_to_group_factory: Optional[ + Mapping[DiscretizationTag, ElementGroupFactory]] = None, + mpi_communicator: Optional["mpi4py.MPI.Intracomm"] = None, + inter_part_connections: Optional[ + Mapping[Tuple[PartID, PartID], + DiscretizationConnection]] = None, + ) -> None: """ :arg discr_tag_to_group_factory: A mapping from discretization tags (typically one of: :class:`grudge.dof_desc.DISCR_TAG_BASE`, @@ -101,63 +258,8 @@ def __init__(self, array_context: ArrayContext, mesh: Mesh, discretization. """ - if (quad_tag_to_group_factory is not None - and discr_tag_to_group_factory is not None): - raise ValueError( - "Both `quad_tag_to_group_factory` and `discr_tag_to_group_factory` " - "are specified. Use `discr_tag_to_group_factory` instead." - ) - - # FIXME: `quad_tag_to_group_factory` is deprecated - if (quad_tag_to_group_factory is not None - and discr_tag_to_group_factory is None): - warn("`quad_tag_to_group_factory` is a deprecated kwarg and will " - "be dropped in version 2022.x. 
Use `discr_tag_to_group_factory` " - "instead.", - DeprecationWarning, stacklevel=2) - discr_tag_to_group_factory = quad_tag_to_group_factory - self._setup_actx = array_context.clone() - from meshmode.discretization.poly_element import \ - default_simplex_group_factory - - if discr_tag_to_group_factory is None: - if order is None: - raise TypeError( - "one of 'order' and 'discr_tag_to_group_factory' must be given" - ) - - discr_tag_to_group_factory = { - DISCR_TAG_BASE: default_simplex_group_factory( - base_dim=mesh.dim, order=order)} - else: - if order is not None: - discr_tag_to_group_factory = discr_tag_to_group_factory.copy() - if DISCR_TAG_BASE in discr_tag_to_group_factory: - raise ValueError( - "if 'order' is given, 'discr_tag_to_group_factory' must " - "not have a key of DISCR_TAG_BASE" - ) - - discr_tag_to_group_factory[DISCR_TAG_BASE] = \ - default_simplex_group_factory(base_dim=mesh.dim, order=order) - - # Modal discr should always come from the base discretization - discr_tag_to_group_factory[DISCR_TAG_MODAL] = \ - _generate_modal_group_factory( - discr_tag_to_group_factory[DISCR_TAG_BASE] - ) - - self.discr_tag_to_group_factory = discr_tag_to_group_factory - - from meshmode.discretization import Discretization - - self._volume_discr = Discretization( - array_context, mesh, - self.group_factory_for_discretization_tag(DISCR_TAG_BASE) - ) - # {{{ process mpi_communicator argument if mpi_communicator is not None: @@ -181,9 +283,60 @@ def __init__(self, array_context: ArrayContext, mesh: Mesh, # }}} - self._dist_boundary_connections = \ - self._set_up_distributed_communication( - mpi_communicator, array_context) + from meshmode.discretization import Discretization + + if isinstance(volume_discrs, Mesh): + # {{{ deprecated backward compatibility yuck + + warn("Calling the DiscretizationCollection constructor directly " + "is deprecated, call make_discretization_collection " + "instead. 
This will stop working in 2023.", + DeprecationWarning, stacklevel=2) + + mesh = volume_discrs + + mesh = _normalize_mesh_part_ids( + mesh, [VTAG_ALL], mpi_communicator=mpi_communicator) + + discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory( + dim=mesh.dim, + discr_tag_to_group_factory=discr_tag_to_group_factory, + order=order) + self._discr_tag_to_group_factory = discr_tag_to_group_factory + + volume_discr = Discretization( + array_context, mesh, + self.group_factory_for_discretization_tag(DISCR_TAG_BASE)) + volume_discrs = {VTAG_ALL: volume_discr} + + del mesh + + if inter_part_connections is not None: + raise TypeError("may not pass inter_part_connections when " + "DiscretizationCollection constructor is called in " + "legacy mode") + + self._inter_part_connections = \ + _set_up_inter_part_connections( + array_context=self._setup_actx, + mpi_communicator=mpi_communicator, + volume_discrs=volume_discrs, + base_group_factory=( + discr_tag_to_group_factory[DISCR_TAG_BASE])) + + # }}} + else: + assert discr_tag_to_group_factory is not None + self._discr_tag_to_group_factory = discr_tag_to_group_factory + + if inter_part_connections is None: + raise TypeError("inter_part_connections must be passed when " + "DiscretizationCollection constructor is called in " + "'modern' mode") + + self._inter_part_connections = inter_part_connections + + self._volume_discrs = volume_discrs # }}} @@ -196,16 +349,6 @@ def mpi_communicator(self): return self._mpi_communicator - @property - def quad_tag_to_group_factory(self): - warn("`DiscretizationCollection.quad_tag_to_group_factory` " - "is deprecated and will go away in 2022. 
Use " - "`DiscretizationCollection.discr_tag_to_group_factory` " - "instead.", - DeprecationWarning, stacklevel=2) - - return self.discr_tag_to_group_factory - def get_management_rank_index(self): return 0 @@ -216,86 +359,12 @@ def is_management_rank(self): return self.mpi_communicator.Get_rank() \ == self.get_management_rank_index() - # {{{ distributed - - def _set_up_distributed_communication(self, mpi_communicator, array_context): - from_dd = DOFDesc("vol", DISCR_TAG_BASE) - - boundary_connections = {} - - from meshmode.distributed import get_connected_partitions - connected_parts = get_connected_partitions(self._volume_discr.mesh) - - if connected_parts: - if mpi_communicator is None: - raise RuntimeError("must supply an MPI communicator when using a " - "distributed mesh") - - grp_factory = \ - self.group_factory_for_discretization_tag(DISCR_TAG_BASE) - - local_boundary_connections = {} - for i_remote_part in connected_parts: - local_boundary_connections[i_remote_part] = self.connection_from_dds( - from_dd, DOFDesc(BTAG_PARTITION(i_remote_part), - DISCR_TAG_BASE)) - - from meshmode.distributed import MPIBoundaryCommSetupHelper - with MPIBoundaryCommSetupHelper(mpi_communicator, array_context, - local_boundary_connections, grp_factory) as bdry_setup_helper: - while True: - conns = bdry_setup_helper.complete_some() - if not conns: - break - for i_remote_part, conn in conns.items(): - boundary_connections[i_remote_part] = conn - - return boundary_connections - - def get_distributed_boundary_swap_connection(self, dd): - warn("`DiscretizationCollection.get_distributed_boundary_swap_connection` " - "is deprecated and will go away in 2022. 
Use " - "`DiscretizationCollection.distributed_boundary_swap_connection` " - "instead.", - DeprecationWarning, stacklevel=2) - return self.distributed_boundary_swap_connection(dd) - - def distributed_boundary_swap_connection(self, dd): - """Provides a mapping from the base volume discretization - to the exterior boundary restriction on a parallel boundary - partition described by *dd*. This connection is used to - communicate across element boundaries in different parallel - partitions during distributed runs. - - :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value - convertible to one. The domain tag must be a subclass - of :class:`grudge.dof_desc.DTAG_BOUNDARY` with an - associated :class:`meshmode.mesh.BTAG_PARTITION` - corresponding to a particular communication rank. - """ - if dd.discretization_tag is not DISCR_TAG_BASE: - # FIXME - raise NotImplementedError( - "Distributed communication with discretization tag " - f"{dd.discretization_tag} is not implemented." - ) - - assert isinstance(dd.domain_tag, DTAG_BOUNDARY) - assert isinstance(dd.domain_tag.tag, BTAG_PARTITION) - - return self._dist_boundary_connections[dd.domain_tag.tag.part_nr] - - # }}} - # {{{ discr_from_dd @memoize_method - def discr_from_dd(self, dd): + def discr_from_dd(self, dd: "ConvertibleToDOFDesc") -> Discretization: """Provides a :class:`meshmode.discretization.Discretization` object from *dd*. - - :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value - convertible to one. 
""" dd = as_dofdesc(dd) @@ -305,45 +374,43 @@ def discr_from_dd(self, dd): return self._modal_discr(dd.domain_tag) if dd.is_volume(): - if discr_tag is not DISCR_TAG_BASE: - return self._discr_tag_volume_discr(discr_tag) - return self._volume_discr + return self._volume_discr_from_dd(dd) if discr_tag is not DISCR_TAG_BASE: - no_quad_discr = self.discr_from_dd(DOFDesc(dd.domain_tag)) + base_discr = self.discr_from_dd(dd.with_discr_tag(DISCR_TAG_BASE)) from meshmode.discretization import Discretization return Discretization( self._setup_actx, - no_quad_discr.mesh, + base_discr.mesh, self.group_factory_for_discretization_tag(discr_tag) ) assert discr_tag is DISCR_TAG_BASE - if dd.domain_tag is FACE_RESTR_ALL: - return self._all_faces_volume_connection().to_discr - elif dd.domain_tag is FACE_RESTR_INTERIOR: - return self._interior_faces_connection().to_discr - elif dd.is_boundary_or_partition_interface(): - return self._boundary_connection(dd.domain_tag.tag).to_discr + if isinstance(dd.domain_tag, BoundaryDomainTag): + if dd.domain_tag.tag in [FACE_RESTR_ALL, FACE_RESTR_INTERIOR]: + return self._faces_connection(dd.domain_tag).to_discr + else: + return self._boundary_connection(dd.domain_tag).to_discr else: - raise ValueError("DOF desc tag not understood: " + str(dd)) + raise ValueError(f"DOF desc not understood: {dd}") # }}} # {{{ _base_to_geoderiv_connection @memoize_method - def _has_affine_groups(self): + def _has_affine_groups(self, domain_tag: DomainTag) -> bool: from modepy.shapes import Simplex + discr = self.discr_from_dd(DOFDesc(domain_tag, DISCR_TAG_BASE)) return any( megrp.is_affine and issubclass(megrp._modepy_shape_cls, Simplex) - for megrp in self._volume_discr.mesh.groups) + for megrp in discr.mesh.groups) @memoize_method - def _base_to_geoderiv_connection(self, dd: DOFDesc): + def _base_to_geoderiv_connection(self, dd: DOFDesc) -> DiscretizationConnection: r"""The "geometry derivatives" discretization for a given *dd* is typically identical to the one 
returned by :meth:`discr_from_dd`, however for affinely-mapped simplicial elements, it will use a @@ -356,7 +423,7 @@ def _base_to_geoderiv_connection(self, dd: DOFDesc): :mod:`grudge`. """ base_discr = self.discr_from_dd(dd) - if not self._has_affine_groups(): + if not self._has_affine_groups(dd.domain_tag): # no benefit to having another discretization that takes # advantage of affine-ness from meshmode.discretization.connection import \ @@ -393,7 +460,9 @@ def geo_group_factory(megrp, index): # {{{ connection_from_dds @memoize_method - def connection_from_dds(self, from_dd, to_dd): + def connection_from_dds( + self, from_dd: "ConvertibleToDOFDesc", to_dd: "ConvertibleToDOFDesc" + ) -> DiscretizationConnection: """Provides a mapping (connection) from one discretization to another, e.g. from the volume to the boundary, or from the base to the an overintegrated quadrature discretization, or from @@ -425,12 +494,15 @@ def connection_from_dds(self, from_dd, to_dd): assert (to_discr_tag is not DISCR_TAG_MODAL and from_discr_tag is not DISCR_TAG_MODAL) - if (not from_dd.is_volume() + if (isinstance(from_dd.domain_tag, BoundaryDomainTag) and from_discr_tag == to_discr_tag - and to_dd.domain_tag is FACE_RESTR_ALL): + and isinstance(to_dd.domain_tag, BoundaryDomainTag) + and to_dd.domain_tag.tag is FACE_RESTR_ALL): faces_conn = self.connection_from_dds( - DOFDesc("vol"), - DOFDesc(from_dd.domain_tag)) + DOFDesc( + VolumeDomainTag(from_dd.domain_tag.volume_tag), + DISCR_TAG_BASE), + from_dd.with_discr_tag(DISCR_TAG_BASE)) from meshmode.discretization.connection import \ make_face_to_all_faces_embedding @@ -448,7 +520,7 @@ def connection_from_dds(self, from_dd, to_dd): from meshmode.discretization.connection import \ ChainedDiscretizationConnection - intermediate_dd = DOFDesc(to_dd.domain_tag) + intermediate_dd = to_dd.with_discr_tag(DISCR_TAG_BASE) return ChainedDiscretizationConnection( [ # first change domain @@ -482,73 +554,79 @@ def connection_from_dds(self, from_dd, 
to_dd): # }}} if from_discr_tag is not DISCR_TAG_BASE: - raise ValueError("cannot interpolate *from* a " - "(non-interpolatory) quadrature grid") + raise ValueError("cannot get a connection *from* a " + f"(non-interpolatory) quadrature grid: '{from_dd}'") assert to_discr_tag is DISCR_TAG_BASE - if from_dd.is_volume(): - if to_dd.domain_tag is FACE_RESTR_ALL: - return self._all_faces_volume_connection() - if to_dd.domain_tag is FACE_RESTR_INTERIOR: - return self._interior_faces_connection() - elif to_dd.is_boundary_or_partition_interface(): - assert from_discr_tag is DISCR_TAG_BASE - return self._boundary_connection(to_dd.domain_tag.tag) + if isinstance(from_dd.domain_tag, VolumeDomainTag): + if isinstance(to_dd.domain_tag, BoundaryDomainTag): + if to_dd.domain_tag.volume_tag != from_dd.domain_tag.tag: + raise ValueError("cannot get a connection from one volume " + f"('{from_dd.domain_tag.tag}') " + "to the boundary of another volume " + f"('{to_dd.domain_tag.volume_tag}') ") + if to_dd.domain_tag.tag in [FACE_RESTR_ALL, FACE_RESTR_INTERIOR]: + return self._faces_connection(to_dd.domain_tag) + elif isinstance(to_dd.domain_tag, BoundaryDomainTag): + assert from_discr_tag is DISCR_TAG_BASE + return self._boundary_connection(to_dd.domain_tag) elif to_dd.is_volume(): + if to_dd.domain_tag != from_dd.domain_tag: + raise ValueError("cannot get a connection between " + "volumes of different tags: requested " + f"'{from_dd.domain_tag}' -> '{to_dd.domain_tag}'") + from meshmode.discretization.connection import \ make_same_mesh_connection - to_discr = self._discr_tag_volume_discr(to_discr_tag) - from_discr = self._volume_discr - return make_same_mesh_connection(self._setup_actx, to_discr, - from_discr) + return make_same_mesh_connection( + self._setup_actx, + self._volume_discr_from_dd(to_dd), + self._volume_discr_from_dd(from_dd)) else: - raise ValueError("cannot interpolate from volume to: " + str(to_dd)) + raise ValueError( + f"cannot get a connection from volume to: 
def group_factory_for_discretization_tag(self, discretization_tag):
    """Return the element group factory registered for *discretization_tag*.

    *None* is treated as ``DISCR_TAG_BASE``.

    :raises KeyError: if no factory is registered for the tag.
    """
    if discretization_tag is None:
        discretization_tag = DISCR_TAG_BASE

    return self._discr_tag_to_group_factory[discretization_tag]

# }}}

# {{{ (internal) discretization getters

@memoize_method
def _volume_discr_from_dd(self, dd: DOFDesc) -> Discretization:
    """Return the volume discretization described by *dd*.

    For ``DISCR_TAG_BASE`` this is the stored base discretization of the
    volume. For any other discretization tag, a new discretization is built
    on the same mesh with the group factory registered for that tag.

    :arg dd: a DOF descriptor whose domain tag is a ``VolumeDomainTag``.
    :raises ValueError: if the volume tag of *dd* is not known.
    """
    assert isinstance(dd.domain_tag, VolumeDomainTag)

    try:
        base_volume_discr = self._volume_discrs[dd.domain_tag.tag]
    except KeyError as err:
        # Chain explicitly so the traceback reports the lookup failure as
        # the cause rather than as an error inside the handler.
        raise ValueError("a volume discretization with volume tag "
                f"'{dd.domain_tag.tag}' is not known") from err

    # Refuse to re-make the volume discretization
    if dd.discretization_tag is DISCR_TAG_BASE:
        return base_volume_discr

    from meshmode.discretization import Discretization
    return Discretization(
        self._setup_actx, base_volume_discr.mesh,
        self.group_factory_for_discretization_tag(dd.discretization_tag)
        )
meshmode.discretization import Discretization discr_base = self.discr_from_dd(DOFDesc(domain_tag, DISCR_TAG_BASE)) @@ -557,10 +635,12 @@ def _modal_discr(self, domain_tag): self.group_factory_for_discretization_tag(DISCR_TAG_MODAL) ) + # }}} + # {{{ connection factories: modal<->nodal @memoize_method - def _modal_to_nodal_connection(self, to_dd): + def _modal_to_nodal_connection(self, to_dd: DOFDesc) -> DiscretizationConnection: """ :arg to_dd: a :class:`grudge.dof_desc.DOFDesc` describing the dofs corresponding to the @@ -575,7 +655,8 @@ def _modal_to_nodal_connection(self, to_dd): ) @memoize_method - def _nodal_to_modal_connection(self, from_dd): + def _nodal_to_modal_connection( + self, from_dd: DOFDesc) -> DiscretizationConnection: """ :arg from_dd: a :class:`grudge.dof_desc.DOFDesc` describing the dofs corresponding to the @@ -594,25 +675,31 @@ def _nodal_to_modal_connection(self, from_dd): # {{{ connection factories: boundary @memoize_method - def _boundary_connection(self, boundary_tag): + def _boundary_connection( + self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection: return make_face_restriction( - self._setup_actx, - self._volume_discr, - self.group_factory_for_discretization_tag(DISCR_TAG_BASE), - boundary_tag=boundary_tag - ) + self._setup_actx, + self._volume_discr_from_dd( + DOFDesc(VolumeDomainTag(domain_tag.volume_tag), DISCR_TAG_BASE)), + self.group_factory_for_discretization_tag(DISCR_TAG_BASE), + boundary_tag=domain_tag.tag) # }}} - # {{{ connection factories: interior faces + # {{{ connection factories: faces @memoize_method - def _interior_faces_connection(self): + def _faces_connection( + self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection: + assert domain_tag.tag in [FACE_RESTR_INTERIOR, FACE_RESTR_ALL] + return make_face_restriction( self._setup_actx, - self._volume_discr, + self._volume_discr_from_dd( + DOFDesc( + VolumeDomainTag(domain_tag.volume_tag), DISCR_TAG_BASE)), 
self.group_factory_for_discretization_tag(DISCR_TAG_BASE), - FACE_RESTR_INTERIOR, + domain_tag.tag, # FIXME: This will need to change as soon as we support # pyramids or other elements with non-identical face @@ -621,7 +708,8 @@ def _interior_faces_connection(self): ) @memoize_method - def opposite_face_connection(self): + def opposite_face_connection( + self, domain_tag: BoundaryDomainTag) -> DiscretizationConnection: """Provides a mapping from the base volume discretization to the exterior boundary restriction on a neighboring element. This does not take into account parallel partitions. @@ -629,93 +717,78 @@ def opposite_face_connection(self): from meshmode.discretization.connection import \ make_opposite_face_connection + assert domain_tag.tag is FACE_RESTR_INTERIOR + return make_opposite_face_connection( self._setup_actx, - self._interior_faces_connection()) + self._faces_connection(domain_tag)) # }}} - # {{{ connection factories: all-faces - - @memoize_method - def _all_faces_volume_connection(self): - return make_face_restriction( - self._setup_actx, - self._volume_discr, - self.group_factory_for_discretization_tag(DISCR_TAG_BASE), - FACE_RESTR_ALL, - - # FIXME: This will need to change as soon as we support - # pyramids or other elements with non-identical face - # types. 
# {{{ properties

@property
def dim(self) -> int:
    """The topological dimension common to all volume discretizations."""
    volume_discrs = self._volume_discrs.values()
    return single_valued(discr.dim for discr in volume_discrs)

@property
def ambient_dim(self) -> int:
    """The ambient-space dimension common to all volume discretizations."""
    volume_discrs = self._volume_discrs.values()
    return single_valued(discr.ambient_dim for discr in volume_discrs)

@property
def real_dtype(self) -> "np.dtype[Any]":
    """The data type used for real-valued arithmetic."""
    volume_discrs = self._volume_discrs.values()
    return single_valued(discr.real_dtype for discr in volume_discrs)

@property
def complex_dtype(self) -> "np.dtype[Any]":
    """The data type used for complex-valued arithmetic."""
    volume_discrs = self._volume_discrs.values()
    return single_valued(discr.complex_dtype for discr in volume_discrs)

# }}}

# {{{ array creators

def empty(self, array_context: ArrayContext, dtype=None,
        *, dd: Optional[DOFDesc] = None) -> DOFArray:
    """Return an empty :class:`~meshmode.dof_array.DOFArray` on the
    discretization described by *dd*.

    :arg array_context: an :class:`~arraycontext.context.ArrayContext`.
    :arg dtype: type special value 'c' will result in a
        vector of dtype :attr:`complex_dtype`. If
        *None* (the default), a real vector will be returned.
    :arg dd: the DOF descriptor selecting the discretization. Defaults to
        :class:`grudge.dof_desc.DD_VOLUME_ALL`.
    """
    target_dd = DD_VOLUME_ALL if dd is None else dd
    return self.discr_from_dd(target_dd).empty(array_context, dtype)

def zeros(self, array_context: ArrayContext, dtype=None,
        *, dd: Optional[DOFDesc] = None) -> DOFArray:
    """Return a zero-initialized :class:`~meshmode.dof_array.DOFArray` on
    the discretization described by *dd*.

    :arg array_context: an :class:`~arraycontext.context.ArrayContext`.
    :arg dtype: type special value 'c' will result in a
        vector of dtype :attr:`complex_dtype`. If
        *None* (the default), a real vector will be returned.
    :arg dd: the DOF descriptor selecting the discretization. Defaults to
        :class:`grudge.dof_desc.DD_VOLUME_ALL`.
    """
    target_dd = DD_VOLUME_ALL if dd is None else dd
    return self.discr_from_dd(target_dd).zeros(array_context, dtype)

def is_volume_where(self, where):
    """Return whether *where* is *None* or describes a volume."""
    if where is None:
        return True
    return as_dofdesc(where).is_volume()

# }}}
Use DiscretizationCollection instead.", - DeprecationWarning, stacklevel=2) +# {{{ distributed/multi-volume setup + +def _set_up_inter_part_connections( + array_context: ArrayContext, + mpi_communicator: Optional["mpi4py.MPI.Intracomm"], + volume_discrs: Mapping[VolumeTag, Discretization], + base_group_factory: ElementGroupFactory, + ) -> Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection]: + + from meshmode.distributed import (get_connected_parts, + make_remote_group_infos, InterRankBoundaryInfo, + MPIBoundaryCommSetupHelper) + + rank = mpi_communicator.Get_rank() if mpi_communicator is not None else None + + # Save boundary restrictions as they're created to avoid potentially creating + # them twice in the loop below + cached_part_bdry_restrictions: Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection] = {} + + def get_part_bdry_restriction(self_part_id, other_part_id): + cached_result = cached_part_bdry_restrictions.get( + (self_part_id, other_part_id), None) + if cached_result is not None: + return cached_result + return cached_part_bdry_restrictions.setdefault( + (self_part_id, other_part_id), + make_face_restriction( + array_context, volume_discrs[self_part_id.volume_tag], + base_group_factory, + boundary_tag=BTAG_PARTITION(other_part_id))) + + inter_part_conns: Mapping[ + Tuple[PartID, PartID], + DiscretizationConnection] = {} + + irbis = [] + + for vtag, volume_discr in volume_discrs.items(): + part_id = PartID(vtag, rank) + connected_part_ids = get_connected_parts(volume_discr.mesh) + for connected_part_id in connected_part_ids: + bdry_restr = get_part_bdry_restriction( + self_part_id=part_id, other_part_id=connected_part_id) + + if connected_part_id.rank == rank: + # {{{ rank-local interface between multiple volumes + + connected_bdry_restr = get_part_bdry_restriction( + self_part_id=connected_part_id, other_part_id=part_id) + + from meshmode.discretization.connection import \ + make_partition_connection + 
inter_part_conns[connected_part_id, part_id] = \ + make_partition_connection( + array_context, + local_bdry_conn=bdry_restr, + remote_bdry_discr=connected_bdry_restr.to_discr, + remote_group_infos=make_remote_group_infos( + array_context, part_id, connected_bdry_restr)) + + # }}} + else: + # {{{ cross-rank interface + + if mpi_communicator is None: + raise RuntimeError("must supply an MPI communicator " + "when using a distributed mesh") + + irbis.append( + InterRankBoundaryInfo( + local_part_id=part_id, + remote_part_id=connected_part_id, + remote_rank=connected_part_id.rank, + local_boundary_connection=bdry_restr)) + + # }}} + + if irbis: + assert mpi_communicator is not None + + with MPIBoundaryCommSetupHelper(mpi_communicator, array_context, + irbis, base_group_factory) as bdry_setup_helper: + while True: + conns = bdry_setup_helper.complete_some() + if not conns: + # We're done. + break + + inter_part_conns.update(conns) - super().__init__(*args, **kwargs) + return inter_part_conns +# }}} + + +# {{{ modal group factory def _generate_modal_group_factory(nodal_group_factory): from meshmode.discretization.poly_element import ( @@ -769,4 +934,101 @@ def _generate_modal_group_factory(nodal_group_factory): f"Unknown mesh element group: {mesh_group_cls}" ) +# }}} + + +# {{{ make_discretization_collection + +MeshOrDiscr = Union[Mesh, Discretization] + + +def make_discretization_collection( + array_context: ArrayContext, + volumes: Union[ + MeshOrDiscr, + Mapping[VolumeTag, MeshOrDiscr]], + order: Optional[int] = None, + discr_tag_to_group_factory: Optional[ + Mapping[DiscretizationTag, ElementGroupFactory]] = None, + _result_type: type = DiscretizationCollection + ) -> DiscretizationCollection: + """ + :arg discr_tag_to_group_factory: A mapping from discretization tags + (typically one of: :class:`~grudge.dof_desc.DISCR_TAG_BASE`, + :class:`~grudge.dof_desc.DISCR_TAG_MODAL`, or + :class:`~grudge.dof_desc.DISCR_TAG_QUAD`) to a + 
:class:`~meshmode.discretization.ElementGroupFactory` + indicating with which type of discretization the operations are + to be carried out, or *None* to indicate that operations with this + discretization tag should be carried out with the standard volume + discretization. + + .. note:: + + If passing a :class:`~meshmode.discretization.Discretization` in + *volumes*, it must be nodal and unisolvent, consistent with + :class:`~grudge.dof_desc.DISCR_TAG_BASE`. + + .. note:: + + To use the resulting :class:`DiscretizationCollection` in a + distributed-memory manner, the *array_context* passed in + must be one of the distributed-memory array contexts + from :mod:`grudge.array_context`. Unlike the (now-deprecated, + for direct use) constructor of :class:`DiscretizationCollection`, + this function no longer accepts a separate MPI communicator. + + .. note:: + + If the resulting :class:`DiscretizationCollection` is distributed + across multiple ranks, then this is an MPI-collective operation, + i.e. all ranks in the communicator must enter this function at the same + time. 
+ """ + + if isinstance(volumes, (Mesh, Discretization)): + volumes = {VTAG_ALL: volumes} + + from pytools import single_valued, is_single_valued + + assert len(volumes) > 0 + assert is_single_valued(mesh_or_discr.ambient_dim + for mesh_or_discr in volumes.values()) + + discr_tag_to_group_factory = _normalize_discr_tag_to_group_factory( + dim=single_valued( + mesh_or_discr.dim for mesh_or_discr in volumes.values()), + discr_tag_to_group_factory=discr_tag_to_group_factory, + order=order) + + del order + + mpi_communicator = getattr(array_context, "mpi_communicator", None) + + if any( + isinstance(mesh_or_discr, Discretization) + for mesh_or_discr in volumes.values()): + raise NotImplementedError("Doesn't work at the moment") + + volume_discrs = { + vtag: Discretization( + array_context, + _normalize_mesh_part_ids( + mesh, volumes.keys(), mpi_communicator=mpi_communicator), + discr_tag_to_group_factory[DISCR_TAG_BASE]) + for vtag, mesh in volumes.items()} + + return _result_type( + array_context=array_context, + volume_discrs=volume_discrs, + discr_tag_to_group_factory=discr_tag_to_group_factory, + inter_part_connections=_set_up_inter_part_connections( + array_context=array_context, + mpi_communicator=mpi_communicator, + volume_discrs=volume_discrs, + base_group_factory=discr_tag_to_group_factory[DISCR_TAG_BASE])) + +# }}} + + # vim: foldmethod=marker diff --git a/grudge/dof_desc.py b/grudge/dof_desc.py index 267e4f56e..cf285a30e 100644 --- a/grudge/dof_desc.py +++ b/grudge/dof_desc.py @@ -1,4 +1,55 @@ -"""Degree of freedom (DOF) descriptions""" +""" +Volume tags +----------- + +.. autoclass:: VolumeTag +.. autoclass:: VTAG_ALL + +:mod:`grudge`-specific boundary tags +------------------------------------ + +Domain tags +----------- + +A domain tag identifies a geometric part (or whole) of the domain described +by a :class:`grudge.DiscretizationCollection`. This can be a volume or a boundary. + +.. autoclass:: DTAG_SCALAR +.. autoclass:: DTAG_VOLUME_ALL +.. 
autoclass:: VolumeDomainTag +.. autoclass:: BoundaryDomainTag + +Discretization tags +------------------- + +A discretization tag serves as a symbolic identifier of the manner in which +meaning is assigned to degrees of freedom. + +.. autoclass:: DISCR_TAG_BASE +.. autoclass:: DISCR_TAG_QUAD +.. autoclass:: DISCR_TAG_MODAL + +DOF Descriptor +-------------- + +.. autoclass:: DOFDesc +.. autofunction:: as_dofdesc + +Shortcuts +--------- + +.. data:: DD_SCALAR +.. data:: DD_VOLUME_ALL +.. data:: DD_VOLUME_ALL_MODAL + +Internal things that are visible due to type annotations +-------------------------------------------------------- + +.. class:: _DiscretizationTag +.. class:: ConvertibleToDOFDesc + + Anything that is convertible to a :class:`DOFDesc` via :func:`as_dofdesc`. +""" __copyright__ = """ Copyright (C) 2008 Andreas Kloeckner @@ -25,31 +76,18 @@ THE SOFTWARE. """ -from meshmode.discretization.connection import \ - FACE_RESTR_INTERIOR, FACE_RESTR_ALL -from meshmode.mesh import \ - BTAG_PARTITION, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE -from warnings import warn import sys +from warnings import warn +from typing import Hashable, Union, Type, Optional, Any, Tuple +from dataclasses import dataclass, replace +from meshmode.discretization.connection import ( + FACE_RESTR_INTERIOR, FACE_RESTR_ALL) +from meshmode.mesh import ( + BTAG_PARTITION, BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE, BoundaryTag) -__doc__ = """ -.. autoclass:: DTAG_SCALAR -.. autoclass:: DTAG_VOLUME_ALL -.. autoclass:: DTAG_BOUNDARY - -.. autoclass:: DISCR_TAG_BASE -.. autoclass:: DISCR_TAG_QUAD -.. autoclass:: DISCR_TAG_MODAL - -.. autoclass:: DOFDesc -.. autofunction:: as_dofdesc - -.. data:: DD_SCALAR -.. data:: DD_VOLUME -.. 
data:: DD_VOLUME_MODAL -""" +# {{{ _to_identifier def _to_identifier(name: str) -> str: if not name.isidentifier(): @@ -57,71 +95,91 @@ def _to_identifier(name: str) -> str: else: return name +# }}} + + +# {{{ volume tags -# {{{ DOF description +class VTAG_ALL: # noqa: N801 + pass -class DTAG_SCALAR: # noqa: N801 + +VolumeTag = Hashable + +# }}} + + +# {{{ domain tag + +@dataclass(frozen=True, eq=True) +class ScalarDomainTag: # noqa: N801 """A domain tag denoting scalar values.""" -class DTAG_VOLUME_ALL: # noqa: N801 - """ - A domain tag denoting values defined - in all cell volumes. +DTAG_SCALAR = ScalarDomainTag() + + +@dataclass(frozen=True, eq=True, init=True) +class VolumeDomainTag: + """A domain tag referring to a volume identified by the + volume tag :attr:`tag`. These volume identifiers are only used + when the :class:`~grudge.discretization.DiscretizationCollection` contains + more than one volume. + + .. attribute:: tag + + .. automethod:: __init__ """ + tag: VolumeTag -class DTAG_BOUNDARY: # noqa: N801 - """A domain tag describing the values on element - boundaries which are adjacent to elements - of another :class:`~meshmode.mesh.Mesh`. +DTAG_VOLUME_ALL = VolumeDomainTag(VTAG_ALL) + + +@dataclass(frozen=True, eq=True, init=True) +class BoundaryDomainTag: + """A domain tag referring to a boundary identified by the + boundary tag :attr:`tag`. .. attribute:: tag + .. attribute:: volume_tag .. automethod:: __init__ - .. automethod:: __eq__ - .. automethod:: __ne__ - .. automethod:: __hash__ """ + tag: BoundaryTag + volume_tag: VolumeTag = VTAG_ALL - def __init__(self, tag): - """ - :arg tag: One of the following: - :class:`~meshmode.mesh.BTAG_ALL`, - :class:`~meshmode.mesh.BTAG_NONE`, - :class:`~meshmode.mesh.BTAG_REALLY_ALL`, - :class:`~meshmode.mesh.BTAG_PARTITION`. 
- """ - self.tag = tag - def __eq__(self, other): - return isinstance(other, DTAG_BOUNDARY) and self.tag == other.tag +DomainTag = Union[ScalarDomainTag, VolumeDomainTag, BoundaryDomainTag] - def __ne__(self, other): - return not self.__eq__(other) +# }}} + + +# {{{ discretization tag + +class _DiscretizationTag: # noqa: N801 + pass - def __hash__(self): - return hash(type(self)) ^ hash(self.tag) - def __repr__(self): - return "<{}({})>".format(type(self).__name__, repr(self.tag)) +DiscretizationTag = Type[_DiscretizationTag] -class DISCR_TAG_BASE: # noqa: N801 +class DISCR_TAG_BASE(_DiscretizationTag): # noqa: N801 """A discretization tag indicating the use of a - basic discretization grid. This tag is used + nodal and unisolvent discretization. This tag is used to distinguish the base discretization from quadrature (e.g. overintegration) or modal (:class:`DISCR_TAG_MODAL`) discretizations. """ -class DISCR_TAG_QUAD: # noqa: N801 - """A discretization tag indicating the use of a - quadrature discretization grid. This tag is used - to distinguish the quadrature discretization - (e.g. overintegration) from modal (:class:`DISCR_TAG_MODAL`) - or base (:class:`DISCR_TAG_BASE`) discretizations. +class DISCR_TAG_QUAD(_DiscretizationTag): # noqa: N801 + """A discretization tag indicating the use of a quadrature discretization + grid, which typically affords higher quadrature accuracy (e.g. for + nonlinear terms) at the expense of unisolvency. This tag is used to + distinguish the quadrature discretization (e.g. overintegration) from modal + (:class:`DISCR_TAG_MODAL`) or base (:class:`DISCR_TAG_BASE`) + discretizations. For working with multiple quadrature grids, it is recommended to create appropriate subclasses of @@ -135,20 +193,22 @@ class CustomQuadTag(DISCR_TAG_QUAD): "A custom quadrature discretization tag." 
dd = DOFDesc(DTAG_VOLUME_ALL, CustomQuadTag) - """ -class DISCR_TAG_MODAL: # noqa: N801 - """A discretization tag indicating the use of a - basic discretization grid with modal degrees of - freedom. This tag is used to distinguish the - modal discretization from the base (nodal) - discretization (e.g. :class:`DISCR_TAG_BASE`) or +class DISCR_TAG_MODAL(_DiscretizationTag): # noqa: N801 + """A discretization tag indicating the use of unisolvent modal degrees of + freedom. This tag is used to distinguish the modal discretization from the + base (nodal) discretization (e.g. :class:`DISCR_TAG_BASE`) or discretizations on quadrature grids (:class:`DISCR_TAG_QUAD`). """ +# }}} + + +# {{{ DOF descriptor +@dataclass(frozen=True, eq=True) class DOFDesc: """Describes the meaning of degrees of freedom. @@ -165,8 +225,9 @@ class DOFDesc: .. automethod:: uses_quadrature + .. automethod:: with_domain_tag .. automethod:: with_discr_tag - .. automethod:: with_dtag + .. automethod:: trace .. automethod:: __eq__ .. automethod:: __ne__ @@ -174,159 +235,87 @@ class DOFDesc: .. automethod:: as_identifier """ - def __init__(self, domain_tag, discretization_tag=None, - # FIXME: `quadrature_tag` is deprecated - quadrature_tag=None): - """ - :arg domain_tag: One of the following: - :class:`DTAG_SCALAR` (or the string ``"scalar"``), - :class:`DTAG_VOLUME_ALL` (or the string ``"vol"``) - for the default volume discretization, - :data:`~meshmode.discretization.connection.FACE_RESTR_ALL` - (or the string ``"all_faces"``), or - :data:`~meshmode.discretization.connection.FACE_RESTR_INTERIOR` - (or the string ``"int_faces"``), or one of - :class:`~meshmode.mesh.BTAG_ALL`, - :class:`~meshmode.mesh.BTAG_NONE`, - :class:`~meshmode.mesh.BTAG_REALLY_ALL`, - :class:`~meshmode.mesh.BTAG_PARTITION`, - or *None* to indicate that the geometry is not yet known. 
- - :arg discretization_tag: - *None* or :class:`DISCR_TAG_BASE` to indicate the use of the basic - discretization grid, :class:`DISCR_TAG_MODAL` to indicate a - modal discretization, or :class:`DISCR_TAG_QUAD` to indicate - the use of a quadrature grid. - """ + domain_tag: DomainTag + discretization_tag: DiscretizationTag - if domain_tag is None: - pass - elif domain_tag in [DTAG_SCALAR, "scalar"]: - domain_tag = DTAG_SCALAR - elif domain_tag in [DTAG_VOLUME_ALL, "vol"]: - domain_tag = DTAG_VOLUME_ALL - elif domain_tag in [FACE_RESTR_ALL, "all_faces"]: - domain_tag = FACE_RESTR_ALL - elif domain_tag in [FACE_RESTR_INTERIOR, "int_faces"]: - domain_tag = FACE_RESTR_INTERIOR - elif isinstance(domain_tag, BTAG_PARTITION): - domain_tag = DTAG_BOUNDARY(domain_tag) - elif domain_tag in [BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE]: - domain_tag = DTAG_BOUNDARY(domain_tag) - elif isinstance(domain_tag, DTAG_BOUNDARY): - pass - else: - raise ValueError("domain tag not understood: %s" % domain_tag) + def __init__(self, domain_tag: Any, + discretization_tag: Optional[type[DiscretizationTag]] = None): - if (quadrature_tag is not None and discretization_tag is not None): - raise ValueError( - "Both `quadrature_tag` and `discretization_tag` are specified. " - "Use `discretization_tag` instead." - ) - - # FIXME: `quadrature_tag` is deprecated - if (quadrature_tag is not None and discretization_tag is None): - warn("`quadrature_tag` is a deprecated kwarg and will be dropped " - "in version 2022.x. Use `discretization_tag` instead.", - DeprecationWarning, stacklevel=2) - discretization_tag = quadrature_tag + if ( + not (isinstance(domain_tag, + (ScalarDomainTag, BoundaryDomainTag, VolumeDomainTag))) + or discretization_tag is None + or ( + not isinstance(discretization_tag, type) + or not issubclass(discretization_tag, _DiscretizationTag))): + warn("Sloppy construction of DOFDesc is deprecated. " + "This will stop working in 2023. " + "Call as_dofdesc instead, with the same arguments. 
", + DeprecationWarning, stacklevel=2) - if domain_tag is DTAG_SCALAR and discretization_tag is not None: - raise ValueError("cannot have nontrivial discretization tag on scalar") + domain_tag, discretization_tag = _normalize_domain_and_discr_tag( + domain_tag, discretization_tag) - if discretization_tag is None: - discretization_tag = DISCR_TAG_BASE - - # FIXME: String tags are deprecated - if isinstance(discretization_tag, str): - warn("Support for string values of `discretization_tag` will " - "be dropped in version 2022.x. Use one of the `DISCR_TAG_` " - "tags instead.", - DeprecationWarning, stacklevel=2) - - self.domain_tag = domain_tag - self.discretization_tag = discretization_tag - - @property - def quadrature_tag(self): - warn("`DOFDesc.quadrature_tag` is deprecated and will be dropped " - "in version 2022.x. Use `DOFDesc.discretization_tag` instead.", - DeprecationWarning, stacklevel=2) - return self.discretization_tag + object.__setattr__(self, "domain_tag", domain_tag) + object.__setattr__(self, "discretization_tag", discretization_tag) - def is_scalar(self): - return self.domain_tag is DTAG_SCALAR + def is_scalar(self) -> bool: + return isinstance(self.domain_tag, ScalarDomainTag) - def is_discretized(self): + def is_discretized(self) -> bool: return not self.is_scalar() - def is_volume(self): - return self.domain_tag is DTAG_VOLUME_ALL + def is_volume(self) -> bool: + return isinstance(self.domain_tag, VolumeDomainTag) - def is_boundary_or_partition_interface(self): - return isinstance(self.domain_tag, DTAG_BOUNDARY) - - def is_trace(self): - return (self.is_boundary_or_partition_interface() - or self.domain_tag in [ + def is_boundary_or_partition_interface(self) -> bool: + return (isinstance(self.domain_tag, BoundaryDomainTag) + and self.domain_tag.tag not in [ FACE_RESTR_ALL, FACE_RESTR_INTERIOR]) - def uses_quadrature(self): + def is_trace(self) -> bool: + return isinstance(self.domain_tag, BoundaryDomainTag) + + def uses_quadrature(self) -> bool: 
# FIXME: String tags are deprecated - # Check for string first, otherwise - # `issubclass` will raise an exception whenever - # its first argument is not a class. - # This can go away once support for strings is dropped - # completely. if isinstance(self.discretization_tag, str): # All strings are interpreted as quadrature-related tags return True - elif issubclass(self.discretization_tag, DISCR_TAG_QUAD): - return True - elif issubclass(self.discretization_tag, - (DISCR_TAG_BASE, DISCR_TAG_MODAL)): - return False - else: - raise ValueError( - f"Unsure how to interpret tag: {self.discretization_tag}" - ) - - def with_qtag(self, discr_tag): - warn("`DOFDesc.with_qtag` is deprecated and will be dropped " - "in version 2022.x. Use `DOFDesc.with_discr_tag` instead.", - DeprecationWarning, stacklevel=2) - return self.with_discr_tag(discr_tag) - - def with_discr_tag(self, discr_tag): - return type(self)(domain_tag=self.domain_tag, - discretization_tag=discr_tag) - - def with_dtag(self, dtag): - return type(self)(domain_tag=dtag, - discretization_tag=self.discretization_tag) - - def __eq__(self, other): - return (type(self) == type(other) - and self.domain_tag == other.domain_tag - and self.discretization_tag == other.discretization_tag) - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return hash((type(self), self.domain_tag, self.discretization_tag)) - - def __repr__(self): - def fmt(s): - if isinstance(s, type): - return s.__name__ - else: - return repr(s) + elif isinstance(self.discretization_tag, type): + if issubclass(self.discretization_tag, DISCR_TAG_QUAD): + return True + elif issubclass(self.discretization_tag, + (DISCR_TAG_BASE, DISCR_TAG_MODAL)): + return False + + raise ValueError( + f"Invalid discretization tag: {self.discretization_tag}") + + def with_dtag(self, dtag) -> "DOFDesc": + from warnings import warn + warn("'with_dtag' is deprecated. Use 'with_domain_tag' instead. 
" + "This will stop working in 2023", + DeprecationWarning, stacklevel=2) + return replace(self, domain_tag=dtag) + + def with_domain_tag(self, dtag) -> "DOFDesc": + return replace(self, domain_tag=dtag) + + def trace(self, btag: BoundaryTag) -> "DOFDesc": + """Return a :class:`DOFDesc` for the restriction of the volume + descriptor *self* to the boundary named by *btag*. + + An error is raised if this method is called on a non-volume instance of + :class:`DOFDesc`. + """ + if not isinstance(self.domain_tag, VolumeDomainTag): + raise ValueError(f"must originate on volume, got '{self.domain_tag}'") + return replace(self, + domain_tag=BoundaryDomainTag(btag, volume_tag=self.domain_tag.tag)) - return "DOFDesc({}, {})".format( - fmt(self.domain_tag), - fmt(self.discretization_tag)) + def with_discr_tag(self, discr_tag) -> "DOFDesc": + return replace(self, discretization_tag=discr_tag) def as_identifier(self) -> str: """Returns a descriptive string for this :class:`DOFDesc` that is usable @@ -341,7 +330,16 @@ def as_identifier(self) -> str: dom_id = "f_all" elif self.domain_tag is FACE_RESTR_INTERIOR: dom_id = "f_int" - elif isinstance(self.domain_tag, DTAG_BOUNDARY): + elif isinstance(self.domain_tag, VolumeDomainTag): + vtag = self.domain_tag.tag + if isinstance(vtag, type): + vtag = vtag.__name__.replace("VTAG_", "").lower() + elif isinstance(vtag, str): + vtag = _to_identifier(vtag) + else: + vtag = _to_identifier(str(vtag)) + dom_id = f"v_{vtag}" + elif isinstance(self.domain_tag, BoundaryDomainTag): btag = self.domain_tag.tag if isinstance(btag, type): btag = btag.__name__.replace("BTAG_", "").lower() @@ -369,31 +367,101 @@ def as_identifier(self) -> str: return f"{dom_id}{discr_id}" -DD_SCALAR = DOFDesc(DTAG_SCALAR, None) +DD_SCALAR = DOFDesc(DTAG_SCALAR, DISCR_TAG_BASE) +DD_VOLUME_ALL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_BASE) +DD_VOLUME_ALL_MODAL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_MODAL) + + +def _normalize_domain_and_discr_tag( + domain: Any, + 
discretization_tag: Optional[DiscretizationTag] = None, + *, _contextual_volume_tag: Optional[VolumeTag] = None + ) -> Tuple[DomainTag, DiscretizationTag]: + + if _contextual_volume_tag is None: + _contextual_volume_tag = VTAG_ALL + + if domain == "scalar": + domain = DTAG_SCALAR + elif isinstance(domain, (ScalarDomainTag, BoundaryDomainTag, VolumeDomainTag)): + pass + elif domain in [VTAG_ALL, "vol"]: + domain = DTAG_VOLUME_ALL + elif domain in [FACE_RESTR_ALL, "all_faces"]: + domain = BoundaryDomainTag(FACE_RESTR_ALL, _contextual_volume_tag) + elif domain in [FACE_RESTR_INTERIOR, "int_faces"]: + domain = BoundaryDomainTag(FACE_RESTR_INTERIOR, _contextual_volume_tag) + elif isinstance(domain, BTAG_PARTITION): + domain = BoundaryDomainTag(domain, _contextual_volume_tag) + elif domain in [BTAG_ALL, BTAG_REALLY_ALL, BTAG_NONE]: + domain = BoundaryDomainTag(domain, _contextual_volume_tag) + else: + raise ValueError("domain tag not understood: %s" % domain) + + if domain is DTAG_SCALAR and discretization_tag is not None: + raise ValueError("cannot have nontrivial discretization tag on scalar") + + if discretization_tag is None: + discretization_tag = DISCR_TAG_BASE -DD_VOLUME = DOFDesc(DTAG_VOLUME_ALL, None) + return domain, discretization_tag + + +ConvertibleToDOFDesc = Any + + +def as_dofdesc( + domain: "ConvertibleToDOFDesc", + discretization_tag: Optional[DiscretizationTag] = None, + *, _contextual_volume_tag: Optional[VolumeTag] = None) -> DOFDesc: + """ + :arg domain: One of the following: + :class:`DTAG_SCALAR` (or the string ``"scalar"``), + :class:`DTAG_VOLUME_ALL` (or the string ``"vol"``) + for the default volume discretization, + :data:`~meshmode.discretization.connection.FACE_RESTR_ALL` + (or the string ``"all_faces"``), or + :data:`~meshmode.discretization.connection.FACE_RESTR_INTERIOR` + (or the string ``"int_faces"``), or one of + :class:`~meshmode.mesh.BTAG_ALL`, + :class:`~meshmode.mesh.BTAG_NONE`, + :class:`~meshmode.mesh.BTAG_REALLY_ALL`, + 
:class:`~meshmode.mesh.BTAG_PARTITION`, + or *None* to indicate that the geometry is not yet known. + + :arg discretization_tag: + *None* or :class:`DISCR_TAG_BASE` to indicate the use of the basic + discretization grid, :class:`DISCR_TAG_MODAL` to indicate a + modal discretization, or :class:`DISCR_TAG_QUAD` to indicate + the use of a quadrature grid. + """ -DD_VOLUME_MODAL = DOFDesc(DTAG_VOLUME_ALL, DISCR_TAG_MODAL) + if isinstance(domain, DOFDesc): + return domain + domain, discretization_tag = _normalize_domain_and_discr_tag( + domain, discretization_tag, + _contextual_volume_tag=_contextual_volume_tag) -def as_dofdesc(dd): - if isinstance(dd, DOFDesc): - return dd - return DOFDesc(dd, discretization_tag=None) + return DOFDesc(domain, discretization_tag) # }}} -# {{{ Deprecated tags +# {{{ deprecations -_deprecated_name_to_new_name = {"QTAG_NONE": "DISCR_TAG_BASE", - "QTAG_MODAL": "DISCR_TAG_MODAL"} +_deprecated_name_to_new_name = { + "DTAG_VOLUME": "VolumeDomainTag", + "DTAG_BOUNDARY": "BoundaryDomainTag", + "DD_VOLUME": "DD_VOLUME_ALL", + "DD_VOLUME_MODAL": "DD_VOLUME_ALL_MODAL" + } def __getattr__(name): if name in _deprecated_name_to_new_name: warn(f"'{name}' is deprecated and will be dropped " - f"in version 2022.x. Use '{_deprecated_name_to_new_name[name]}' " + f"in version 2023.x. 
Use '{_deprecated_name_to_new_name[name]}' " "instead.", DeprecationWarning, stacklevel=2) return globals()[_deprecated_name_to_new_name[name]] diff --git a/grudge/dt_utils.py b/grudge/dt_utils.py index 73f307b38..c45701e01 100644 --- a/grudge/dt_utils.py +++ b/grudge/dt_utils.py @@ -43,7 +43,9 @@ """ +from typing import Optional, Sequence import numpy as np +import loopy as lp from arraycontext import ArrayContext, Scalar, tag_axes from arraycontext.metadata import NameHint @@ -52,8 +54,10 @@ DiscretizationFaceAxisTag, DiscretizationElementAxisTag) -from grudge.dof_desc import DD_VOLUME, DOFDesc, as_dofdesc +from grudge.dof_desc import ( + DD_VOLUME_ALL, DOFDesc, as_dofdesc, BoundaryDomainTag, FACE_RESTR_ALL) from grudge.discretization import DiscretizationCollection +from grudge.grudge_tags import KernelDataTag, ParameterValue, IsFaceDOFArray, IsDOFArray import grudge.op as op @@ -63,7 +67,8 @@ def characteristic_lengthscales( - actx: ArrayContext, dcoll: DiscretizationCollection) -> DOFArray: + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None) -> DOFArray: r"""Computes the characteristic length scale :math:`h_{\text{loc}}` at each node. The characteristic length scale is mainly useful for estimating the stable time step size. E.g. for a hyperbolic system, an estimate of the @@ -79,7 +84,7 @@ def characteristic_lengthscales( node distance on the reference cell (see :func:`dt_non_geometric_factors`), and :math:`r_D` is the inradius of the cell (see :func:`dt_geometric_factors`). - :returns: a frozen :class:`~meshmode.dof_array.DOFArray` containing a + :returns: a :class:`~meshmode.dof_array.DOFArray` containing a characteristic lengthscale for each element, at each nodal location. .. note:: @@ -91,7 +96,7 @@ def characteristic_lengthscales( methods has been used as a guide. Any concrete time integrator will likely require scaling of the values returned by this routine. 
""" - @memoize_in(dcoll, (characteristic_lengthscales, + @memoize_in(dcoll, (characteristic_lengthscales, dd, "compute_characteristic_lengthscales")) def _compute_characteristic_lengthscales(): return actx.freeze( @@ -103,15 +108,16 @@ def _compute_characteristic_lengthscales(): # corresponding group non-geometric factor cng * geo_facts for cng, geo_facts in zip( - dt_non_geometric_factors(dcoll), - actx.thaw(dt_geometric_factors(dcoll))))))) + dt_non_geometric_factors(dcoll, dd), + actx.thaw(dt_geometric_factors(dcoll, dd))))))) return actx.thaw(_compute_characteristic_lengthscales()) @memoize_on_first_arg def dt_non_geometric_factors( - dcoll: DiscretizationCollection, dd=None) -> list: + dcoll: DiscretizationCollection, dd: Optional[DOFDesc] = None + ) -> Sequence[float]: r"""Computes the non-geometric scale factors following [Hesthaven_2008]_, section 6.4, for each element group in the *dd* discretization: @@ -128,7 +134,7 @@ def dt_non_geometric_factors( node distance on the reference element for each group. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL discr = dcoll.discr_from_dd(dd) min_delta_rs = [] @@ -160,7 +166,8 @@ def dt_non_geometric_factors( @memoize_on_first_arg def h_max_from_volume( - dcoll: DiscretizationCollection, dim=None, dd=None) -> Scalar: + dcoll: DiscretizationCollection, dim=None, + dd: Optional[DOFDesc] = None) -> Scalar: """Returns a (maximum) characteristic length based on the volume of the elements. This length may not be representative if the elements have very high aspect ratios. 
@@ -175,7 +182,7 @@ def h_max_from_volume( from grudge.reductions import nodal_max, elementwise_sum if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dd = as_dofdesc(dd) if dim is None: @@ -191,7 +198,8 @@ def h_max_from_volume( @memoize_on_first_arg def h_min_from_volume( - dcoll: DiscretizationCollection, dim=None, dd=None) -> Scalar: + dcoll: DiscretizationCollection, dim=None, + dd: Optional[DOFDesc] = None) -> Scalar: """Returns a (minimum) characteristic length based on the volume of the elements. This length may not be representative if the elements have very high aspect ratios. @@ -206,7 +214,7 @@ def h_min_from_volume( from grudge.reductions import nodal_min, elementwise_sum if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dd = as_dofdesc(dd) if dim is None: @@ -221,7 +229,7 @@ def h_min_from_volume( def dt_geometric_factors( - dcoll: DiscretizationCollection, dd=None) -> DOFArray: + dcoll: DiscretizationCollection, dd: Optional[DOFDesc] = None) -> DOFArray: r"""Computes a geometric scaling factor for each cell following [Hesthaven_2008]_, section 6.4, defined as the inradius (radius of an inscribed circle/sphere). 
@@ -244,7 +252,7 @@ def dt_geometric_factors( from meshmode.discretization.poly_element import SimplexElementGroupBase if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL actx = dcoll._setup_actx volm_discr = dcoll.discr_from_dd(dd) @@ -271,7 +279,8 @@ def dt_geometric_factors( # Inscribed "circle" radius is half the cell size return actx.freeze(cell_vols/2) - dd_face = DOFDesc("all_faces", dd.discretization_tag) + dd_face = dd.with_domain_tag( + BoundaryDomainTag(FACE_RESTR_ALL, dd.domain_tag.tag)) face_discr = dcoll.discr_from_dd(dd_face) # Compute areas of each face @@ -281,6 +290,62 @@ def dt_geometric_factors( ) ) + data = [] + + if actx.supports_nonscalar_broadcasting: + for vgrp, face_ae_i in zip(volm_discr.groups, face_areas): + + fp_format = face_ae_i.dtype + Ne = vgrp.nelements + Nf = vgrp.mesh_el_group.nfaces + Nj = face_ae_i.shape[-1]#afgrp.nunit_dofs + + kernel_data = [ + lp.GlobalArg("arg0", fp_format, strides=lp.auto, shape=(Nf, Ne, Nj), tags=[IsFaceDOFArray()]), + #lp.GlobalArg("out", fp_format, is_output=True), # Specifying causes wrong soln + lp.ValueArg("Nf", tags=[ParameterValue(Nf)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + ... 
+ ] + kd_tag = KernelDataTag(kernel_data) + + data.append(actx.einsum("fej->e", + tag_axes(actx, { + 0: DiscretizationFaceAxisTag(), + 1: DiscretizationElementAxisTag(), + 2: DiscretizationDOFAxisTag() + }, + #face_ae_i.reshape(Nf, Ne, face_ae_i.shape[-1])), + face_ae_i.reshape(Nf, Ne, Nj)), + tagged=(FirstAxisIsElementsTag(),kd_tag))) + else: + + for vgrp, afgrp, face_ae_i in zip(volm_discr.groups, face_discr.groups, face_areas): + fp_format = face_ae_i.dtype + Ne = vgrp.nelements + Nf = vgrp.mesh_el_group.nfaces + Nj = face_ae_i.shape[-1]#afgrp.nunit_dofs + + kernel_data = [ + lp.GlobalArg("arg0", fp_format, strides=lp.auto, shape=(Nf, Ne, Nj), tags=[IsFaceDOFArray()]), + #lp.GlobalArg("out", fp_format, is_output=True), # Specifying causes wrong soln + lp.ValueArg("Nf", tags=[ParameterValue(Nf)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + ... + ] + kd_tag = KernelDataTag(kernel_data) + + + data.append(actx.einsum("fej->e", + #face_ae_i.reshape(Nf, Ne, face_ae_i.shape[-1]), + face_ae_i.reshape(Nf, Ne, Nj), + tagged=(FirstAxisIsElementsTag(),kd_tag)) / afgrp.nunit_dofs) + + surface_areas = DOFArray(actx, data=tuple(data)) + + """ if actx.supports_nonscalar_broadcasting: # Compute total surface area of an element by summing over the # individual face areas @@ -325,14 +390,30 @@ def dt_geometric_factors( face_areas) ) ) + """ - return actx.freeze( - actx.tag(NameHint(f"dt_geometric_{dd.as_identifier()}"), - DOFArray(actx, - data=tuple( - actx.einsum("e,ei->ei", 1/sae_i, cv_i, - tagged=(FirstAxisIsElementsTag(),)) * dcoll.dim - for cv_i, sae_i in zip(cell_vols, surface_areas))))) + data = [] + for cv_i, sae_i, in zip(cell_vols, surface_areas): + + fp_format = cv_i.dtype + Ne, Ni = cv_i.shape + + kernel_data = [ + lp.GlobalArg("arg0", sae_i.dtype, shape=(Ne,), strides=lp.auto), + lp.GlobalArg("arg1", fp_format, shape=(Ne, Ni), tags=[IsDOFArray()]), + lp.GlobalArg("out", fp_format, shape=(Ne, Ni), 
tags=[IsDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + ... + ] + kd_tag = KernelDataTag(kernel_data) + + data.append(actx.einsum("e,ei->ei", + 1/sae_i, + cv_i, + tagged=(FirstAxisIsElementsTag(),kd_tag)) * dcoll.dim) + + return actx.freeze(actx.tag(NameHint(f"dt_geometric_{dd.as_identifier()}"),DOFArray(actx, data=tuple(data)))) # }}} diff --git a/grudge/eager.py b/grudge/eager.py index 1886cfd04..2175592d4 100644 --- a/grudge/eager.py +++ b/grudge/eager.py @@ -47,14 +47,14 @@ def __init__(self, *args, **kwargs): def project(self, src, tgt, vec): return op.project(self, src, tgt, vec) - def grad(self, vec): - return op.local_grad(self, vec) + def grad(self, *args): + return op.local_grad(self, *args) - def d_dx(self, xyz_axis, vec): - return op.local_d_dx(self, xyz_axis, vec) + def d_dx(self, xyz_axis, *args): + return op.local_d_dx(self, xyz_axis, *args) - def div(self, vecs): - return op.local_div(self, vecs) + def div(self, *args): + return op.local_div(self, *args) def weak_grad(self, *args): return op.weak_local_grad(self, *args) @@ -68,8 +68,8 @@ def weak_div(self, *args): def mass(self, *args): return op.mass(self, *args) - def inverse_mass(self, vec): - return op.inverse_mass(self, vec) + def inverse_mass(self, *args): + return op.inverse_mass(self, *args) def face_mass(self, *args): return op.face_mass(self, *args) @@ -87,8 +87,8 @@ def nodal_max(self, dd, vec): return op.nodal_max(self, dd, vec) -connected_ranks = op.connected_ranks interior_trace_pair = op.interior_trace_pair cross_rank_trace_pairs = op.cross_rank_trace_pairs +inter_volume_trace_pairs = op.inter_volume_trace_pairs # vim: foldmethod=marker diff --git a/grudge/geometry/metrics.py b/grudge/geometry/metrics.py index 89e1f1f2c..f5c622cd5 100644 --- a/grudge/geometry/metrics.py +++ b/grudge/geometry/metrics.py @@ -58,6 +58,7 @@ """ +from typing import Optional, Tuple, Union import numpy as np from arraycontext 
import ArrayContext, tag_axes @@ -68,7 +69,7 @@ import grudge.dof_desc as dof_desc from grudge.dof_desc import ( - DD_VOLUME, DOFDesc, DISCR_TAG_BASE + DD_VOLUME_ALL, DOFDesc, DISCR_TAG_BASE ) from meshmode.transform_metadata import (DiscretizationAmbientDimAxisTag, @@ -115,7 +116,8 @@ def to_quad(vec): def forward_metric_nth_derivative( actx: ArrayContext, dcoll: DiscretizationCollection, - xyz_axis, ref_axes, dd=None, + xyz_axis: int, ref_axes: Union[int, Tuple[Tuple[int, int], ...]], + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> DOFArray: r"""Pointwise metric derivatives representing repeated derivatives of the physical coordinate enumerated by *xyz_axis*: :math:`x_{\mathrm{xyz\_axis}}` @@ -150,7 +152,7 @@ def forward_metric_nth_derivative( metric derivative at each nodal coordinate. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL inner_dd = dd.with_discr_tag(DISCR_TAG_BASE) @@ -182,8 +184,10 @@ def forward_metric_nth_derivative( def forward_metric_derivative_vector( - actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, dd=None, - *, _use_geoderiv_connection=False) -> np.ndarray: + actx: ArrayContext, dcoll: DiscretizationCollection, + rst_axis: Union[int, Tuple[Tuple[int, int], ...]], + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False + ) -> np.ndarray: r"""Computes an object array containing the forward metric derivatives of each physical coordinate. @@ -207,7 +211,9 @@ def forward_metric_derivative_vector( def forward_metric_derivative_mv( - actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + rst_axis: Union[int, Tuple[Tuple[int, int], ...]], + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> MultiVector: r"""Computes a :class:`pymbolic.geometric_algebra.MultiVector` containing the forward metric derivatives of each physical coordinate. 
@@ -230,7 +236,8 @@ def forward_metric_derivative_mv( def forward_metric_derivative_mat( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> np.ndarray: r"""Computes the forward metric derivative matrix, also commonly called the Jacobian matrix, with entries defined as the @@ -257,7 +264,7 @@ def forward_metric_derivative_mat( ambient_dim = dcoll.ambient_dim if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.discr_from_dd(dd).dim @@ -271,7 +278,8 @@ def forward_metric_derivative_mat( def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection, - dd=None, *, _use_geoderiv_connection=False) -> np.ndarray: + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False + ) -> np.ndarray: r"""Computes the first fundamental form using the Jacobian matrix: .. math:: @@ -297,7 +305,7 @@ def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection, form. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL mder = forward_metric_derivative_mat( actx, dcoll, dd=dd, _use_geoderiv_connection=_use_geoderiv_connection) @@ -306,7 +314,8 @@ def first_fundamental_form(actx: ArrayContext, dcoll: DiscretizationCollection, def inverse_metric_derivative_mat( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> np.ndarray: r"""Computes the inverse metric derivative matrix, which is the inverse of the Jacobian (forward metric derivative) matrix. 
@@ -320,7 +329,7 @@ def inverse_metric_derivative_mat( ambient_dim = dcoll.ambient_dim if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.discr_from_dd(dd).dim @@ -336,7 +345,8 @@ def inverse_metric_derivative_mat( def inverse_first_fundamental_form( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> np.ndarray: r"""Computes the inverse of the first fundamental form: @@ -360,7 +370,7 @@ def inverse_first_fundamental_form( first fundamental form. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.discr_from_dd(dd).dim @@ -387,7 +397,8 @@ def inverse_first_fundamental_form( def inverse_metric_derivative( - actx: ArrayContext, dcoll: DiscretizationCollection, rst_axis, xyz_axis, dd, + actx: ArrayContext, dcoll: DiscretizationCollection, + rst_axis: int, xyz_axis: int, dd: DOFDesc, *, _use_geoderiv_connection=False ) -> DOFArray: r"""Computes the inverse metric derivative of the physical @@ -446,7 +457,7 @@ def outprod_with_unit(i, at): def inverse_surface_metric_derivative( actx: ArrayContext, dcoll: DiscretizationCollection, - rst_axis, xyz_axis, dd=None, + rst_axis, xyz_axis, dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False): r"""Computes the inverse surface metric derivative of the physical coordinate enumerated by *xyz_axis* with respect to the @@ -468,7 +479,7 @@ def inverse_surface_metric_derivative( ambient_dim = dcoll.ambient_dim if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dd = dof_desc.as_dofdesc(dd) if ambient_dim == dim: @@ -488,7 +499,8 @@ def inverse_surface_metric_derivative( def inverse_surface_metric_derivative_mat( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, times_area_element=False, _use_geoderiv_connection=False): r"""Computes the matrix of inverse 
surface metric derivatives, indexed by ``(xyz_axis, rst_axis)``. It returns all values of @@ -509,7 +521,7 @@ def inverse_surface_metric_derivative_mat( """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dd = dof_desc.as_dofdesc(dd) @memoize_in(dcoll, (inverse_surface_metric_derivative_mat, dd, @@ -542,7 +554,7 @@ def _inv_surf_metric_deriv(): def _signed_face_ones( - actx: ArrayContext, dcoll: DiscretizationCollection, dd + actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc ) -> DOFArray: assert dd.is_trace() @@ -550,7 +562,7 @@ def _signed_face_ones( # NOTE: ignore quadrature_tags on dd, since we only care about # the face_id here all_faces_conn = dcoll.connection_from_dds( - DD_VOLUME, DOFDesc(dd.domain_tag) + DD_VOLUME_ALL, DOFDesc(dd.domain_tag, DISCR_TAG_BASE) ) signed_ones = dcoll.discr_from_dd(dd.with_discr_tag(DISCR_TAG_BASE)).zeros( actx, dtype=dcoll.real_dtype @@ -571,7 +583,7 @@ def _signed_face_ones( def parametrization_derivative( - actx: ArrayContext, dcoll: DiscretizationCollection, dd, + actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc, *, _use_geoderiv_connection=False) -> MultiVector: r"""Computes the product of forward metric derivatives spanning the tangent space with topological dimension *dim*. @@ -584,7 +596,7 @@ def parametrization_derivative( the product of metric derivatives. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.discr_from_dd(dd).dim if dim == 0: @@ -605,8 +617,10 @@ def parametrization_derivative( ) -def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection, - dd=None, *, _use_geoderiv_connection=False) -> MultiVector: +def pseudoscalar( + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False + ) -> MultiVector: r"""Computes the field of pseudoscalars for the domain/discretization identified by *dd*. 
@@ -618,7 +632,7 @@ def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection, :class:`~meshmode.dof_array.DOFArray`\ s. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL return parametrization_derivative( actx, dcoll, dd, @@ -626,7 +640,8 @@ def pseudoscalar(actx: ArrayContext, dcoll: DiscretizationCollection, def area_element( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False ) -> DOFArray: r"""Computes the scale factor used to transform integrals from reference @@ -642,7 +657,7 @@ def area_element( volumes for each element. """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL @memoize_in(dcoll, (area_element, dd, _use_geoderiv_connection)) def _area_elements(): @@ -662,7 +677,8 @@ def _area_elements(): # {{{ surface normal vectors def rel_mv_normal( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None, + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None, *, _use_geoderiv_connection=False) -> MultiVector: r"""Computes surface normals at each nodal location as a :class:`~pymbolic.geometric_algebra.MultiVector` relative to the @@ -688,7 +704,7 @@ def rel_mv_normal( def mv_normal( - actx: ArrayContext, dcoll: DiscretizationCollection, dd, + actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc, *, _use_geoderiv_connection=False ) -> MultiVector: r"""Exterior unit normal as a :class:`~pymbolic.geometric_algebra.MultiVector`. 
@@ -744,10 +760,10 @@ def _normal(): from grudge.op import project volm_normal = MultiVector( - project(dcoll, dof_desc.DD_VOLUME, dd, + project(dcoll, DD_VOLUME_ALL, dd, rel_mv_normal( actx, dcoll, - dd=dof_desc.DD_VOLUME, + dd=DD_VOLUME_ALL, _use_geoderiv_connection=_use_geoderiv_connection ).as_vector(dtype=object)) ) @@ -768,7 +784,7 @@ def _normal(): return actx.thaw(_normal()) -def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd, +def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd: DOFDesc, *, _use_geoderiv_connection=None): """Get the unit normal to the specified surface discretization, *dd*. This supports both volume discretizations @@ -798,8 +814,8 @@ def normal(actx: ArrayContext, dcoll: DiscretizationCollection, dd, # {{{ Curvature computations def second_fundamental_form( - actx: ArrayContext, dcoll: DiscretizationCollection, dd=None - ) -> np.ndarray: + actx: ArrayContext, dcoll: DiscretizationCollection, + dd: Optional[DOFDesc] = None) -> np.ndarray: r"""Computes the second fundamental form: .. math:: @@ -817,7 +833,7 @@ def second_fundamental_form( """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.discr_from_dd(dd).dim normal = rel_mv_normal(actx, dcoll, dd=dd).as_vector(dtype=object) @@ -846,7 +862,7 @@ def second_fundamental_form( def shape_operator(actx: ArrayContext, dcoll: DiscretizationCollection, - dd=None) -> np.ndarray: + dd: Optional[DOFDesc] = None) -> np.ndarray: r"""Computes the shape operator (also called the curvature tensor) containing second order derivatives: @@ -871,7 +887,7 @@ def shape_operator(actx: ArrayContext, dcoll: DiscretizationCollection, def summed_curvature(actx: ArrayContext, dcoll: DiscretizationCollection, - dd=None) -> DOFArray: + dd: Optional[DOFDesc] = None) -> DOFArray: r"""Computes the sum of the principal curvatures: .. 
math:: @@ -888,7 +904,7 @@ def summed_curvature(actx: ArrayContext, dcoll: DiscretizationCollection, """ if dd is None: - dd = DD_VOLUME + dd = DD_VOLUME_ALL dim = dcoll.ambient_dim - 1 diff --git a/grudge/grudge_array_context.py b/grudge/grudge_array_context.py new file mode 100644 index 000000000..cbec3d72b --- /dev/null +++ b/grudge/grudge_array_context.py @@ -0,0 +1,1688 @@ +from meshmode.array_context import PyOpenCLArrayContext +from grudge.array_context import MPIPyOpenCLArrayContext +from pytools import memoize_method, memoize_in, memoize +import loopy as lp +import pyopencl as cl +import pyopencl.array as cla +import numpy as np + +import grudge.loopy_dg_kernels as dgk +from grudge.grudge_tags import (IsDOFArray, IsSepVecDOFArray, IsFaceDOFArray, + IsOpArray, IsSepVecOpArray, ParameterValue, IsFaceMassOpArray, KernelDataTag, + IsVecDOFArray, IsVecOpArray, IsFourAxisDOFArray, EinsumArgsTags) + +from arraycontext.impl.pyopencl.fake_numpy import (PyOpenCLFakeNumpyNamespace) +from arraycontext.container.traversal import (rec_map_array_container, + multimapped_over_array_containers) + +from hashlib import md5 +import hjson +import os +import pickle +from os.path import exists + +from grudge.loopy_dg_kernels.run_tests import (generic_test, random_search, + exhaustive_search, exhaustive_search_v2) +from arraycontext.container.traversal import rec_multimap_array_container +from typing import Optional + +#from grudge.loopy_dg_kernels.run_tests import analyzeResult + +try: + import importlib.resources as pkg_resources +except ImportError: + # Use backported version for python < 3.7 + import importlib_resources as pkg_resources + +ctof_knl_base = lp.make_copy_kernel("f,f", old_dim_tags="c,c") +ctof_knl = lp.make_kernel(ctof_knl_base.default_entrypoint.domains, + ctof_knl_base.default_entrypoint.instructions, + default_offset=lp.auto) +ctof_knl = lp.tag_array_axes(ctof_knl, "input", "c,c") +ctof_knl = lp.tag_array_axes(ctof_knl, "output", "f,f") + +#ftoc_knl = 
lp.make_copy_kernel("c,c", old_dim_tags="f,f") + +def get_transformation_id(device_id): + hjson_file = pkg_resources.open_text(dgk, "device_mappings.hjson") + hjson_text = hjson_file.read() + hjson_file.close() + od = hjson.loads(hjson_text) + return od[device_id] + +def get_fp_string(dtype): + return "FP64" if dtype == np.float64 else "FP32" + +#def get_order_from_dofs(dofs): +# dofs_to_order = {10: 2, 20: 3, 35: 4, 56: 5, 84: 6, 120: 7} +# return dofs_to_order[dofs] + + +def fix_program_parameters(program): + for arg in program.default_entrypoint.args: + for tag in arg.tags: + if isinstance(tag, ParameterValue): + program = lp.fix_parameters(program, **{arg.name: tag.value}) + return program + +def set_memory_layout(program, order="F"): + # This assumes arguments have only one tag + if order == "F": + for arg in program.default_entrypoint.args: + if IsDOFArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "f,f") + elif IsSepVecDOFArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "sep,f,f") + elif IsSepVecOpArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "sep,c,c") + elif IsFaceDOFArray() in arg.tags: + # Why is this the data layout with fortran ordering? 
+ program = lp.tag_array_axes(program, arg.name, "N1,N0,N2") + elif IsVecDOFArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "N2,N0,N1") + elif IsVecOpArray() in arg.tags or IsFaceMassOpArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "c,c,c") + elif IsFourAxisDOFArray() in arg.tags: + program = lp.tag_array_axes(program, arg.name, "N3,N2,N0,N1") + + #for arg in program.default_entrypoint.args: + # for tag in arg.tags: + # if isinstance(tag, ParameterValue): + # program = lp.fix_parameters(program, **{arg.name: tag.value}) + program = fix_program_parameters(program) + program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True)) + return program + + +# {{{ _get_scalar_func_loopy_program + +def _get_scalar_func_loopy_program(actx, c_name, nargs, axis_lengths): + @memoize_in(actx, _get_scalar_func_loopy_program) + def get(c_name, nargs, naxes): + from pymbolic import var + naxes = len(axis_lengths) + + var_names = ["i%d" % i for i in range(naxes)] + size_names = ["n%d" % i for i in range(naxes)] + subscript = tuple(var(vname) for vname in var_names) + from islpy import make_zero_and_vars + v = make_zero_and_vars(var_names, params=size_names) + domain = v[0].domain() + for vname, sname in zip(var_names, size_names): + domain = domain & v[0].le_set(v[vname]) & v[vname].lt_set(v[sname]) + + domain_bset, = domain.get_basic_sets() + + from arraycontext.loopy import make_loopy_program + from arraycontext.transform_metadata import ElementwiseMapKernelTag + + tags = [IsDOFArray()] if naxes > 1 else [] + kernel_data = [ + lp.GlobalArg("inp%d" % i, None, shape=tuple(size_names), tags=tags) + for i in range(nargs)] + kernel_data.append( + lp.GlobalArg("out", None, shape=tuple(size_names), tags=tags)) + #for name, val in zip(size_names, axis_lengths): + # kernel_data.append(lp.ValueArg(name, tags=[ParameterValue(val)])) + kernel_data.append(...) 
+ + prg = make_loopy_program( + [domain_bset], + [ + lp.Assignment( + var("out")[subscript], + var(c_name)(*[ + var("inp%d" % i)[subscript] for i in range(nargs)])) + ], + kernel_data=kernel_data, + name="actx_special_%s" % c_name, + tags=(ElementwiseMapKernelTag(),)) + + return prg + + return get(c_name, nargs, axis_lengths) + +# }}} + + +class GrudgeFakeNumpyNamespace(PyOpenCLFakeNumpyNamespace): + + # ¿Debería este ser más inteligente? + # This function has no idea if `a` is in flattened C or F order. Should it be assumed to be in "C" layout? + def reshape(self, a, newshape, order="C"): # Order here is the input layout or output layout? + #print("================CALLING RESHAPE================") + #print(type(a)) + #assert np.allclose(a.reshape(newshape, order="F").get(), a.reshape(newshape, order="C").get()) + + return rec_map_array_container( + lambda ary: ctof_knl(self._array_context.queue, input=ary.reshape(newshape, order="C"))[1][0], a) + # Need to override the default for now. + + # Could be problematic. Unflatten has no idea if the data has been changed from "F" layout to + # (flattened) "C" layout so when order="F" is specified data is moved around. + # Maybe some tags should be attached to the flattened arrays? + def ravel(self, a, order="C"): # Order here is the output layout + def _rec_ravel(a): + # Couldn't this be accomplished with an ftoc kernel followed by an a.reshape? 
+ if order == "C" and len(a.shape) == 2 and a.flags.f_contiguous: + @memoize_in(self._array_context, (_rec_ravel, "flatten_grp_ary_prg")) + def prg(): + from arraycontext import make_loopy_program + t_unit = make_loopy_program( + [ + "{[iel]: 0 <= iel < nelements}", + "{[idof]: 0 <= idof < ndofs_per_element}" + ], + """ + result[iel * ndofs_per_element + idof] = grp_ary[iel, idof] + """, + [ + lp.GlobalArg("result", None, + shape="nelements * ndofs_per_element"), + lp.GlobalArg("grp_ary", None, + shape=("nelements", "ndofs_per_element"), tags=[IsDOFArray()]), + lp.ValueArg("nelements", np.int32), + lp.ValueArg("ndofs_per_element", np.int32), + "..." + ], + name="flatten_grp_ary" + ) + return t_unit + #return lp.tag_inames(t_unit, { + # "iel": ConcurrentElementInameTag(), + # "idof": ConcurrentDOFInameTag()}) + + result = self._array_context.call_loopy(prg(), grp_ary=a)["result"] + return result + elif order in "FC": + return a.reshape(-1, order=order) + elif order == "A": + # TODO: upstream this to pyopencl.array + if a.flags.f_contiguous: + return a.reshape(-1, order="F") + elif a.flags.c_contiguous: + return a.reshape(-1, order="C") + else: + raise ValueError("For `order='A'`, array should be either" + " F-contiguous or C-contiguous.") + elif order == "K": + raise NotImplementedError("PyOpenCLArrayContext.np.ravel not " + "implemented for 'order=K'") + else: + raise ValueError("`order` can be one of 'F', 'C', 'A' or 'K'. " + f"(got {order})") + + return rec_map_array_container(_rec_ravel, a) + + + def stack(self,arrays, axis=0): + from pytools.obj_array import make_obj_array + + if not axis == 0: + raise NotImplementedError("Axes other than 0 are not currently supported") + + def _stack(arrays, queue): + + #print(len(arrays)) + #print(arrays[0].shape) + #print(arrays[0].strides) + + # This sorts the strides from lowest to highest and then + # uses their original indices to create a list of "N{i}" + # strings. 
+ + ndims = len(arrays[0].shape) + lp_strides_ordered = np.array([f"N{i}" for i in range(ndims)]) + lp_strides = np.empty_like(lp_strides_ordered) + sorted_estrides = np.array(sorted(list(enumerate(arrays[0].strides)), key=lambda tup : tup[1])) + for i, j in enumerate(sorted_estrides[:,0]): + lp_strides[j] = lp_strides_ordered[i] + + lp_strides_out = [f"N{ndims}"] + list(lp_strides) + lp_strides_in = ["sep"] + list(lp_strides) + + # Loopy errors with this, constructing string instead + #prg = lp.make_copy_kernel(lp_strides_out, old_dim_tags=lp_strides_in) + + # Loopy errors when try to use the lp_strides lists directly + str_strides_in = "" + str_strides_out = "" + + for s0, s1 in zip(lp_strides_out, lp_strides_in): + str_strides_out += s0 + "," + str_strides_in += s1 + "," + str_strides_out = str_strides_out[:len(str_strides_out) - 1] + str_strides_in = str_strides_in[:len(str_strides_in) - 1] + + #print(arrays[0].strides) + #print(str_strides_in) + #print(str_strides_out) + + prg = lp.make_copy_kernel(str_strides_out, old_dim_tags=str_strides_in) + + # Fix the kernel parameters + d = {"n{}".format(i+1): n for i,n in enumerate(arrays[0].shape)} + d["n0"] = len(arrays) + prg = lp.fix_parameters(prg, **d) + + # Should call_loopy be used instead? Probably. 
No reason no to + result = prg(queue, input=make_obj_array(arrays))[1][0] + #print(result.shape) + return result + + return rec_multimap_array_container( + lambda *args: _stack(args, self._array_context.queue), + *arrays) + + #return rec_multimap_array_container( + # lambda *args: cla.stack(arrays=args, axis=axis, + # queue=self._array_context.queue), + # *arrays) + + def __getattr__(self, name): + def loopy_implemented_elwise_func(*args): + if all(np.isscalar(ary) for ary in args): + return getattr( + np, self._c_to_numpy_arc_functions.get(name, name) + )(*args) + actx = self._array_context + prg = _get_scalar_func_loopy_program(actx, + c_name, nargs=len(args), axis_lengths=args[0].shape) + #for arg in args: + #print("Input dtype:", arg.dtype) + #print("Input shape:", arg.shape) + #print("Input strides:", arg.strides) + #print("Input Sum:", cla.sum(arg)) + ##print("Input Max:", cla.max(arg)) + ##print("Input Min:", cla.min(arg)) + #print("Input numpy:", np.sum(np.abs(arg.get()))) + #if arg.shape == (0,2): + # print("Input array:", arg.get()) + #cargs = [] + #for arg in args: + # print( + #evt, (out,) = ftoc_knl(self._array_context.queue, input=arg) + # cargs.append(out) + # Workaround + #if len(args) == 1 and args[0].shape[0] == 0: + # return args[0] + #print(prg) + + outputs = actx.call_loopy(prg, + #**{"inp%d" % i: cargs[i] for i, arg in enumerate(args)}) + **{"inp%d" % i: arg for i, arg in enumerate(args)}) + + #print("PyOpenCL Output sum:", cla.sum(outputs["out"])) + #print("Output numpy:", np.sum(np.abs(outputs["out"].get()))) + #1/0 + #exit() + return outputs["out"] + + if name in self._c_to_numpy_arc_functions: + from warnings import warn + warn(f"'{name}' in ArrayContext.np is deprecated. " + f"Use '{self._c_to_numpy_arc_functions[name]}' as in numpy. 
" + "The old name will stop working in 2022.", + DeprecationWarning, stacklevel=3) + + # normalize to C names anyway + c_name = self._numpy_to_c_arc_functions.get(name, name) + + # limit which functions we try to hand off to loopy + if (name in self._numpy_math_functions + or name in self._c_to_numpy_arc_functions): + return multimapped_over_array_containers(loopy_implemented_elwise_func) + else: + raise AttributeError( + f"'{type(self._array_context).__name__}.np' object " + f"has no attribute '{name}'") + + """ Old version + def __getattr__(self, name): + def loopy_implemented_elwise_func(*args): + actx = self._array_context + prg = _get_scalar_func_loopy_program(actx, + c_name, nargs=len(args), naxes=len(args[0].shape)) + outputs = actx.call_loopy(prg, + **{"inp%d" % i: arg for i, arg in enumerate(args)}) + return outputs["out"] + + if name in self._c_to_numpy_arc_functions: + from warnings import warn + warn(f"'{name}' in ArrayContext.np is deprecated. " + "Use '{c_to_numpy_arc_functions[name]}' as in numpy. " + "The old name will stop working in 2021.", + DeprecationWarning, stacklevel=3) + + # normalize to C names anyway + c_name = self._numpy_to_c_arc_functions.get(name, name) + + # limit which functions we try to hand off to loopy + if name in self._numpy_math_functions: + return multimapped_over_array_containers(loopy_implemented_elwise_func) + else: + raise AttributeError(name) + """ + +# The PyOpenCLArrayContext needs this since the array dimensions are +# Maybe the parameter fixing should be moved into the PyOpenCLArrayContext +class ParameterFixingPyOpenCLArrayContext(MPIPyOpenCLArrayContext): + + @memoize_method + def transform_loopy_program(self, program): + + program = set_memory_layout(program, order="C") + #program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True)) + # Set no_numpy and return_dict options here? 
+ #for arg in program.default_entrypoint.args: + # for tag in arg.tags: + # if isinstance(tag, ParameterValue): + # program = lp.fix_parameters(program, **{arg.name: tag.value}) + + #program = super().transform_loopy_program(program) + return program + + + def call_loopy(self, program, **kwargs): + + #print(program) + result = super().call_loopy(program, **kwargs) + + queue_properties = self.queue.get_info(cl.command_queue_info.PROPERTIES) + profiling_enabled = cl.command_queue_properties.PROFILING_ENABLE + profiling_is_enabled = queue_properties & profiling_enabled == profiling_enabled + + #try: # Only if profiling is enabled + if profiling_is_enabled: + + evt = None + for val in result.values(): + if isinstance(val, cla.Array): + if val.events is not None and len(val.events) > 0: + evt = val.events[0] + break + + #evt = result["evt"] + evt.wait() + dt = evt.profile.end - evt.profile.start + print("Clock ticks:", dt) + dt = dt / 1e9 + + nbytes = 0 + # Could probably just use program.default_entrypoint.args but maybe all + # parameters are not set + if "resample_by_mat" in program.default_entrypoint.name: + n_to_nodes, n_from_nodes = kwargs["resample_mat"].shape + nbytes = (kwargs["to_element_indices"].shape[0]*n_to_nodes + + n_to_nodes*n_from_nodes + + kwargs["from_element_indices"].shape[0]*n_from_nodes) * 8 + elif program.default_entrypoint.name == "resample_by_picking_group": + nelements = kwargs["from_element_indices"].shape[0] + dpl1, nunit_dofs_tgt = kwargs["dof_pick_lists"].shape + ary_bytes = kwargs["ary"].dtype.itemsize + dpl_bytes = kwargs["dof_pick_lists"].dtype.itemsize + dpli_bytes = kwargs["dof_pick_list_indices"].dtype.itemsize + fei_bytes = kwargs["from_element_indices"].dtype.itemsize + # Data from source and target + the indirections arrays + # Assume indirection arrays and data arrays are fetched only once + nbytes = 2*nelements*nunit_dofs_tgt*ary_bytes + nbytes += nelements*fei_bytes + nelements*dpli_bytes + nunit_dofs_tgt*dpl1*dpl_bytes + elif 
"resample_by_picking" in program.default_entrypoint.name: + # Double check this - this may underestimate the number of bytes transferred + print("Inaccurate byte count for resample_by_picking") + """ + if "rhs" not in program.default_entrypoint.name: + nbytes = kwargs["pick_list"].shape[0] * (kwargs["from_element_indices"].shape[0] + + kwargs["to_element_indices"].shape[0])*8 + else: + nbytes = kwargs["pick_list"].shape[0] * (kwargs["from_element_indices"].shape[0])*8 + """ + else: + # This won't work because not all kernels have dimensions specified + #for arg in program.default_entrypoint.args: + # nbytes += arg.dtype.dtype.itemsize*np.prod(arg.shape) + for key, val in kwargs.items(): + # output may be a list of pyopenclarrays or it could be a + # pyopenclarray. This prevents double counting (allowing + # other for-loop to count the bytes in the former case) + if key not in result.keys(): + try: + nbytes += np.prod(val.shape)*8 + except AttributeError: + nbytes += 0 # Or maybe 1*8 if this is a scalar + for val in result.values(): + try: + nbytes += np.prod(val.shape)*8 + except AttributeError: + nbytes += 0 # Or maybe this is a scalar? + bw = nbytes / dt / 1e9 + + print("Kernel {}, Time {}, Bytes {}, Bandwidth {}".format(program.default_entrypoint.name, dt, nbytes, bw)) + + #except cl._cl.RuntimeError as e: + # pass + + return result + + #@memoize_method # Somehow causes a shape mismatch + def _wrap_get_einsum_prg(self, spec, arg_names, tagged): + + prg = self._get_einsum_prg(spec, arg_names, tagged) + for tag in tagged: + if isinstance(tag, KernelDataTag): + ep = prg.default_entrypoint + prg = lp.make_kernel(ep.domains, ep.instructions, kernel_data=tag.kernel_data, name=ep.name) + return prg + + + def einsum(self, spec, *args, arg_names=None, tagged=()): + """Computes the result of Einstein summation following the + convention in :func:`numpy.einsum`. + + :arg spec: a string denoting the subscripts for + summation as a comma-separated list of subscript labels. 
+ This follows the usual :func:`numpy.einsum` convention. + Note that the explicit indicator `->` for the precise output + form is required. + :arg args: a sequence of array-like operands, whose order matches + the subscript labels provided by *spec*. + :arg arg_names: an optional iterable of string types denoting + the names of the *args*. If *None*, default names will be + generated. + :arg tagged: an optional sequence of :class:`pytools.tag.Tag` + objects specifying the tags to be applied to the operation. + + :return: the output of the einsum :mod:`loopy` program + """ + if arg_names is None: + arg_names = tuple("arg%d" % i for i in range(len(args))) + + td = None + for tag in tagged: + if isinstance(tag, EinsumArgsTags): + td = tag.tags_map + + if td is not None: + prg = self._get_einsum_prg(spec, arg_names, tagged) + + arg_spec, out_spec = spec.split("->") + dim_dict = {} + kernel_data = [] + + # Are there always as many arg_specs as there are args? + for index_chars, arg, name, in zip(arg_spec.split(","), args, arg_names): + dim_dict.update(dict(zip(index_chars, arg.shape))) + kd = lp.GlobalArg(name, arg.dtype, shape=arg.shape, offset=lp.auto, tags=td.get(name)) + kernel_data.append(kd) + out_shape = tuple([dim_dict[index_char] for index_char in out_spec]) + # TODO: More robust way to find output dtype + kd = lp.GlobalArg("out", args[-1].dtype, shape=out_shape, + offset=lp.auto, tags=td.get("out"), is_output=True) + kernel_data.append(kd) + for key, value in dim_dict.items(): + kernel_data.append(lp.ValueArg(f"N{key}", tags=[ParameterValue(value)])) + kernel_data.append(...) 
+ + ep = prg.default_entrypoint + prg = lp.make_kernel(ep.domains, ep.instructions, kernel_data=kernel_data, name=ep.name) + else: + prg = self._wrap_get_einsum_prg(spec, arg_names, tagged) + + return self.call_loopy( + prg, **{arg_names[i]: arg for i, arg in enumerate(args)} + )["out"] + + +class FortranOrderedArrayContext(ParameterFixingPyOpenCLArrayContext): + + def _get_fake_numpy_namespace(self): + return GrudgeFakeNumpyNamespace(self) + + def empty(self, shape, dtype): + return cla.empty(self.queue, shape=shape, dtype=dtype, + allocator=self.allocator, order="F") + + def zeros(self, shape, dtype): + return cla.zeros(self.queue, shape=shape, dtype=dtype, + allocator=self.allocator, order="F") + + def thaw(self, array): + #print("THAWING", array.shape) + thawed = super().thaw(array) + #print("Shape:", thawed.shape) + #print("C_contiguous:", array.flags.c_contiguous) + #print("F_contiguous:", array.flags.f_contiguous) + if hasattr(thawed, "shape") and len(thawed.shape) == 2 and array.flags.c_contiguous and not array.flags.f_contiguous: + result = self.call_loopy(ctof_knl, **{"input": thawed}) + #print("CALLED CTOF") + #assert cla.sum(thawed - result["output"]) == 0 + #exit() + thawed = result["output"] + + #result = ctof_knl(thawed.queue, input=thawed) + #evt, (out,) = ctof_knl(thawed.queue, input=thawed) + #print("CALLED CTOF") + #thawed = out + + return thawed + + + def from_numpy(self, np_array: np.ndarray): + cl_a = super().from_numpy(np_array) + tags = getattr(np_array, "tags", None) + if tags is not None and IsDOFArray() in tags: + # Should this call go through the array context? 
+ print("CHANGING LAYOUT OF INPUT NUMPY ARRAY In from_numpy") + evt, (out,) = ctof_knl(self.queue, input=cl_a) + cl_a = out + return cl_a + + + def transform_loopy_program(self, program): + #program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True)) + program = set_memory_layout(program, order="F") + + # This should probably be a separate function + #for arg in program.default_entrypoint.args: + # for tag in arg.tags: + # if isinstance(tag, ParameterValue): + # program = lp.fix_parameters(program, **{arg.name: tag.value}) + + # PyOpenCLArrayContext default transformations can't handle fortran ordering + #program = super().transform_loopy_program(program) + return program + + +class KernelSavingArrayContext(FortranOrderedArrayContext): +#class KernelSavingArrayContext(ParameterFixingPyOpenCLArrayContext): + def __init__(self, + mpi_communicator, + queue: "pyopencl.CommandQueue", + *, allocator: Optional["pyopencl.tools.AllocatorInterface"] = None, + wait_event_queue_length: Optional[int] = None, + force_device_scalars: bool = False, + save_dir: str = "./pickled_programs") -> None: + + # Currently placed in cwd + self.save_dir = save_dir + os.makedirs(self.save_dir, exist_ok=True) + + super().__init__(mpi_communicator, queue, allocator=allocator, + wait_event_queue_length=wait_event_queue_length, + force_device_scalars=force_device_scalars) + + def transform_loopy_program(self, program): + + if program.default_entrypoint.name in autotuned_kernels: + + # Needs to be set here so autotuner knows dimensions for test data + program = set_memory_layout(program, order="F") + #program = fix_program_parameters(program) + pid = unique_program_id(program) + + # Is there a possible race condition in the multirank case? + # Is there a way to obtain the current rank? 
+ file_path = f"{self.save_dir}/{program.default_entrypoint.name}_{pid}.pickle" + + if not exists(file_path): + # For some reason this doesn't create the directory + print(program.default_entrypoint) + print("====WRITING PROGRAM TO FILE===", file_path) + out_file = open(file_path, "wb") + pickle.dump(program, out_file) + out_file.close() + # Check that the identifier is the same. + print("====READING PROGRAM FROM FILE===", file_path) + f = open(file_path, "rb") + loaded = pickle.load(f) + f.close() + pid2 = unique_program_id(loaded) + #print(pid, pid2) + assert pid == pid2 + + else: + print("PICKLED FILE ALREADY EXISTS", file_path) + #else: + program = super().transform_loopy_program(program) + + return program + +class COrderedKernelSavingArrayContext(ParameterFixingPyOpenCLArrayContext): + def __init__(self, + mpi_communicator, + queue: "pyopencl.CommandQueue", + *, allocator: Optional["pyopencl.tools.AllocatorInterface"] = None, + wait_event_queue_length: Optional[int] = None, + force_device_scalars: bool = False, + save_dir: str = "./pickled_programs") -> None: + + # Currently placed in cwd + self.save_dir = save_dir + os.makedirs(self.save_dir, exist_ok=True) + + super().__init__(mpi_communicator, queue, allocator=allocator, + wait_event_queue_length=wait_event_queue_length, + force_device_scalars=force_device_scalars) + + def transform_loopy_program(self, program): + + if program.default_entrypoint.name in autotuned_kernels: + + # Needs to be set here so autotuner knows dimensions for test data + program = set_memory_layout(program, order="C") + #program = fix_program_parameters(program) + pid = unique_program_id(program) + + # Is there a possible race condition in the multirank case? + # Is there a way to obtain the current rank? 
+ file_path = f"{self.save_dir}/{program.default_entrypoint.name}_{pid}.pickle" + + if not exists(file_path): + # For some reason this doesn't create the directory + print(program.default_entrypoint) + print("====WRITING PROGRAM TO FILE===", file_path) + out_file = open(file_path, "wb") + pickle.dump(program, out_file) + out_file.close() + # Check that the identifier is the same. + print("====READING PROGRAM FROM FILE===", file_path) + f = open(file_path, "rb") + loaded = pickle.load(f) + f.close() + pid2 = unique_program_id(loaded) + #print(pid, pid2) + assert pid == pid2 + + else: + print("PICKLED FILE ALREADY EXISTS", file_path) + + program = super().transform_loopy_program(program) + + return program + + + + +# This class could be used for some set of default transformations +class GrudgeArrayContext(FortranOrderedArrayContext): + + @memoize_method + def transform_loopy_program(self, program): + #print(program.default_entrypoint.name) + + #program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True)) + + + #device_id = "NVIDIA Titan V" + #transform_id = get_transformation_id(device_id) + + # Static (non-autotuned) transformations for the GPU + # This needs to be fixed for new resample by picking kernel + ary_itemsize = 8 # Assume doubles + if "resample_by_picking" in program.default_entrypoint.name: + for arg in program.default_entrypoint.args: + print(arg.name, arg.tags) + if arg.name == "nunit_dofs_tgt" or arg.name == "n_to_nodes": + # Assumes this has has a single ParameterValue tag + n_to_nodes = arg.tags[0].value + elif arg.name == "nelements": + nelements = arg.tags[0].value + elif arg.name == "ary": + ary_itemsize = arg.dtype.dtype.itemsize + + l1 = min(n_to_nodes, 32) + outer = min(nelements, 128) + l0 = min(nelements, 32)#32#((1024 // n_to_nodes) // 32) * 32 # Closest multiple of 32 to 1024 // n_to_nodes + #if l0 == 0: + # l0 = 16 + #if n_to_nodes*16 > 1024: + # l0 = 8 + + #outer = 128#max(l0, 32) + # Prefetch ary if it can fit in 
shared memory + + # Broken, plus if elements are fetched only once this helps not. + #if nelements*n_to_nodes <= self.queue.device.local_mem_size // ary_itemsize: + # program = lp.add_prefetch(program, "ary", "iel,idof", temporary_address_space=lp.AddressSpace.LOCAL, default_tag="l.auto") + + #program = set_memory_layout(program) + if nelements*n_to_nodes > 0: + if nelements*n_to_nodes <= self.queue.device.max_work_group_size: + program = lp.split_iname(program, "iel", nelements, outer_tag="g.0", + inner_tag="l.0", slabs=(0,0)) + program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1", + inner_tag="l.1", slabs=(0,0)) + else: + slabs = (0,0) if outer == nelements else (0,1) + program = lp.split_iname(program, "iel", outer, outer_tag="g.0", + slabs=slabs) + program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp", + inner_tag="l.0", slabs=(0,0)) + slabs = (0,0) if l1 == n_to_nodes else (0,1) + program = lp.split_iname(program, "idof", l1, outer_tag="g.1", + inner_tag="l.1", slabs=slabs) + + + #program = lp.add_inames_for_unused_hw_axes(program) + #program = lp.set_options(program, "write_cl") + elif "actx_special" in program.default_entrypoint.name: # Fixed + #program = set_memory_layout(program) + # Sometimes sqrt is called on single values. 
+ if "i0" in program.default_entrypoint.inames: + program = lp.split_iname(program, "i0", 512, outer_tag="g.0", + inner_tag="l.0", slabs=(0, 1)) + #program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + # slabs=(0,1)) + #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + # inner_tag="l.0") + #program = lp.split_iname(program, "i1", 20, outer_tag="g.1", + # inner_tag="l.1", slabs=(0,0)) + #program2 = lp.join_inames(program, ("i1", "i0"), "i") + #from islpy import BasicMap + #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}") + #program2 = lp.map_domain(program, m) + #print(program2) + #exit() + + #program = super().transform_loopy_program(program) + #print(program) + #print(lp.generate_code_v2(program).device_code()) + + # Not really certain how to do grudge_assign, done for flatten + elif "flatten" in program.default_entrypoint.name: + + #program = set_memory_layout(program) + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + # ctof kernel + elif "loopy_kernel" in program.default_entrypoint.name: + + #program = set_memory_layout(program) + # This is hardcoded. 
Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + print("TRANSFORMING CTOF KERNEL") + program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "i1", 32, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + elif "einsum3to1_kernel" in program.default_entrypoint.name: + + Ne = 0 + for arg in program.default_entrypoint.args: + if arg.name == "Ne": + Ne = arg.tags[0].value + + if Ne != 0: + program = lp.split_iname(program, "e", 128, outer_tag="g.0", slabs=(0,1)) + program = lp.split_iname(program, "e_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1)) + program = lp.prioritize_loops(program, "f,j") + + #else: + #print(program) + #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name) + # The PyOpenCLArrayContext transformations can fail when inames are fixed. + program = super().transform_loopy_program(program) + + return program + + +class COrderedGrudgeArrayContext(ParameterFixingPyOpenCLArrayContext): + + @memoize_method + def transform_loopy_program(self, program): + #print(program.default_entrypoint.name) + + #program = lp.set_options(program, lp.Options(no_numpy=True, return_dict=True)) + + + #device_id = "NVIDIA Titan V" + #transform_id = get_transformation_id(device_id) + + # Static (non-autotuned) transformations for the GPU + # This needs to be fixed for new resample by picking kernel + ary_itemsize = 8 # Assume doubles + if "resample_by_picking" in program.default_entrypoint.name: + for arg in program.default_entrypoint.args: + print(arg.name, arg.tags) + if arg.name == "nunit_dofs_tgt" or arg.name == "n_to_nodes": + # Assumes this has has a single ParameterValue tag + n_to_nodes = arg.tags[0].value + elif arg.name == "nelements": + nelements = arg.tags[0].value + elif arg.name == "ary": + ary_itemsize = arg.dtype.dtype.itemsize + + 
l1 = min(n_to_nodes, 32) + outer = min(nelements, 128) + l0 = min(nelements, 32)#32#((1024 // n_to_nodes) // 32) * 32 # Closest multiple of 32 to 1024 // n_to_nodes + #if l0 == 0: + # l0 = 16 + #if n_to_nodes*16 > 1024: + # l0 = 8 + + #outer = 128#max(l0, 32) + # Prefetch ary if it can fit in shared memory + + # Broken, plus if elements are fetched only once this helps not. + #if nelements*n_to_nodes <= self.queue.device.local_mem_size // ary_itemsize: + + #program = set_memory_layout(program) + #program = lp.add_prefetch(program, "dof_pick_lists", temporary_address_space=lp.AddressSpace.LOCAL) + if nelements*n_to_nodes > 0: + if nelements*n_to_nodes <= self.queue.device.max_work_group_size: + program = lp.split_iname(program, "iel", nelements, outer_tag="g.0", + inner_tag="l.0", slabs=(0,0)) + program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1", + inner_tag="l.1", slabs=(0,0)) + #program = lp.add_prefetch(program, "dof_pick_list_index", "iel_inner", default_tag="l.auto") + #program = lp.add_prefetch(program, "from_element_indices", "iel_inner", default_tag="l.auto") + #program = lp.add_prefetch(program, "dof_pick_lists", "", temporary_address_space=lp.AddressSpace.LOCAL) + else: + slabs = (0,0) if outer == nelements else (0,1) + program = lp.split_iname(program, "iel", outer, outer_tag="g.0", + slabs=slabs) + program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp", + inner_tag="l.0", slabs=(0,0)) + slabs = (0,0) if l1 == n_to_nodes else (0,1) + program = lp.split_iname(program, "idof", l1, outer_tag="g.1", + inner_tag="l.1", slabs=slabs) + # Prefetching these two just slows the kernel, not sure about dof_pick_lists + #program = lp.add_prefetch(program, "dof_pick_list_index", "iel_inner_outer,iel_inner_inner", default_tag="l.auto") + #program = lp.add_prefetch(program, "from_element_indices", "iel_inner_outer,iel_inner_inner", default_tag="l.auto") + #program = lp.add_prefetch(program, "dof_pick_lists", "idof_outer,idof_inner", \ + # 
temporary_address_space=lp.AddressSpace.LOCAL, default_tag="l.auto") + + program = lp.add_inames_for_unused_hw_axes(program) + #program = lp.set_options(program, "write_cl") + elif "actx_special" in program.default_entrypoint.name: # Fixed + # Sometimes sqrt is called on single values. + + if "i0" in program.default_entrypoint.inames: + program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + slabs=(0,1)) + program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "i1", 32, outer_tag="g.1", + inner_tag="l.1", slabs=(0,1)) + + #program = lp.split_iname(program, "i0", 512, outer_tag="g.0", + # inner_tag="l.0", slabs=(0, 1)) + #print(program) + #exit() + #program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + # slabs=(0,1)) + #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + # inner_tag="l.0") + #program = lp.split_iname(program, "i1", 20, outer_tag="g.1", + # inner_tag="l.1", slabs=(0,0)) + #program2 = lp.join_inames(program, ("i1", "i0"), "i") + #from islpy import BasicMap + #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}") + #program2 = lp.map_domain(program, m) + #print(program2) + #exit() + + #program = super().transform_loopy_program(program) + #print(program) + #print(lp.generate_code_v2(program).device_code()) + + # Not really certain how to do grudge_assign, done for flatten + elif "flatten" in program.default_entrypoint.name: + + #program = set_memory_layout(program) + # This is hardcoded. 
Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + # ctof kernel + elif "loopy_kernel" in program.default_entrypoint.name: + + #program = set_memory_layout(program) + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + print("TRANSFORMING CTOF KERNEL") + program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "i1", 32, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + elif "einsum3to1_kernel" == program.default_entrypoint.name: + + print("================EINSUM3TO1_KERNEL=====================") + #program = set_memory_layout(program, order="C") + Ne = 0 + for arg in program.default_entrypoint.args: + if arg.name == "Ne": + Ne = arg.tags[0].value + + if Ne != 0: + program = lp.split_iname(program, "e", 128, outer_tag="g.0", slabs=(0,1)) + program = lp.split_iname(program, "e_inner", 32, outer_tag="ilp", inner_tag="l.0", slabs=(0,1)) + program = lp.prioritize_loops(program, "f,j") + + #else: + #print(program) + #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name) + # The PyOpenCLArrayContext transformations can fail when inames are fixed. 
def unique_program_id(program):
    """Return a 16-character hex identifier for a loopy *program*.

    The identifier concatenates the first four hex digits of the MD5
    digests of, in this order: the entrypoint name, ``str(domains)``,
    ``str`` of the list of stringified instructions, and ``str(args)``.

    Only 16 bits of each digest are kept, so distinct programs can collide.
    The format is left unchanged because the identifier is embedded in the
    names of saved pickle/hjson files.

    :arg program: object exposing ``default_entrypoint`` with ``name``,
        ``domains``, ``instructions`` and ``args`` attributes.
    :returns: a 16-character lowercase hexadecimal string.
    """
    # Per the original author's notes, hashing the generated device code or
    # str(program.default_entrypoint) did not give unique identifiers, and
    # hashing one combined list was unstable; hence the per-component hashes.
    ep = program.default_entrypoint

    def _digest(text):
        return md5(text.encode()).hexdigest()

    name_hash = _digest(ep.name)
    domain_hash = _digest(str(ep.domains))
    instr_hash = _digest(str([str(entry) for entry in ep.instructions]))
    arg_hash = _digest(str(ep.args))

    return name_hash[:4] + domain_hash[:4] + instr_hash[:4] + arg_hash[:4]


def convert(o):
    """Serialization ``default`` hook turning NumPy values into Python ones.

    NumPy scalars are converted with ``item()`` and NumPy arrays with
    ``tolist()``.

    :raises TypeError: for any other object, which is how a JSON-style
        encoder expects an unsupported value to be reported.
    """
    if isinstance(o, np.generic):
        return o.item()
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(
        f"Object of type {type(o).__name__} is not serializable")


# Meshmode and Grudge kernels to autotune
autotuned_kernels = {"einsum3to2_kernel",
                     "einsum4to2_kernel",
                     "einsum5to3_kernel",
                     "einsum2to2_kernel",
                     "diff",
                     "lp_nodes",
                     "grudge_elementwise_sum_knl",
                     "resample_by_picking_single_indirection",
                     # "resample_by_picking_group" is left out: it would
                     # require implementing a special testing function.
                     # The last entry is a mirgecom kernel; this set should
                     # probably become a class variable.
                     "smooth_comp"}
def autotune_and_save(self, queue, program, search_fn, tlist_generator,
        pspace_generator, hjson_file_str, time_limit=np.inf):
    """Run *search_fn* on *program* and save the resulting transformations.

    *search_fn* is called as ``search_fn(queue, program, generic_test,
    pspace_generator, tlist_generator, time_limit=time_limit)`` and must
    return a 3-tuple whose middle element is the transformation list. That
    list is written to *hjson_file_str* under the ``"transformations"`` key
    and returned.

    :raises: the ``cl._cl.RuntimeError`` raised by *search_fn* is re-raised
        after the profiling hint is printed. Previously execution fell
        through to the unbound ``transformations`` name and failed with an
        unrelated ``UnboundLocalError``.
    """
    from hjson import dump

    try:
        # The first and last tuple elements (named avg_time and data in the
        # original code) are not used here.
        _, transformations, _ = search_fn(queue, program, generic_test,
                pspace_generator, tlist_generator, time_limit=time_limit)
    except cl._cl.RuntimeError as e:
        print(e)
        print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
        # There is nothing to save without a search result.
        raise

    # The context manager closes the file even if serialization fails.
    with open(hjson_file_str, "wt+") as out_file:
        dump({"transformations": transformations}, out_file, default=convert)
    print("WRITING TRANSFORMATION FILE:", hjson_file_str)

    return transformations
+ + print("Opening file:", hjson_file_str) + hjson_file = open(hjson_file_str, "rt") + + try: # New hjson structure + transformations = dgk.load_transformations_from_file(hjson_file, + ["transformations"]) + print("LOCATED TRANSFORMATION:", hjson_file_str) + #exit() + except KeyError as e: + # This can eventually be removed since we're now using the hash of the program code to specify the file. + # Kernels with different dimensions will have different files. + hjson_file.seek(0,0) # Move read location back to beginning + + fp_format = None + ndofs = None # The value doesn't matter now + transform_id = get_transformation_id(device_id) + + for arg in program.default_entrypoint.args: + if IsOpArray() in arg.tags: + dim = 1 + ndofs = arg.shape[0] + fp_format = arg.dtype.numpy_dtype + break + elif IsSepVecOpArray() in arg.tags or IsVecOpArray() in arg.tags: + ndofs = arg.shape[1] + fp_format = arg.dtype.numpy_dtype + break + elif IsFaceMassOpArray() in arg.tags: + ndofs = arg.shape[0] + fp_format = arg.dtype.numpy_dtype + break + elif IsDOFArray() in arg.tags: + ndofs = arg.shape[1] + fp_format = arg.dtype.numpy_dtype + break + + if fp_format is None: + print("Unknown fp_format") + exit() + if ndofs is None: + print("Unknown ndofs") + exit() + + fp_string = get_fp_string(fp_format) + indices = [transform_id, fp_string, str(ndofs)] + transformations = dgk.load_transformations_from_file(hjson_file, + indices) + + hjson_file.close() + + #except (KeyError, FileNotFoundError) as e: + # There shouldn't be any more key errors now that PIDs are used + except FileNotFoundError as e: + + """ + # Maybe the generators should be classes so we can use inheritance. 
+ if program.default_entrypoint.name == "einsum3to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum4to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum5to3_kernel": + from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum2to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "grudge_elementwise_sum_knl": + from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator + else: + from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator + from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator + + try: + avg_time, transformations, data = search_fn(self.queue, program, generic_test, + pspace_generator, tlist_generator, time_limit=np.inf) + except cl._cl.RuntimeError as e: + print(e) + print("Profiling is not enabled and the PID does not match any transformation file. 
Turn on profiling and run again.") + + od = {"transformations": transformations} + out_file = open(hjson_file_str, "wt+") + + hjson.dump(od, out_file,default=convert) + out_file.close() + #from pprint import pprint + #pprint(od) + """ + print("TRANSFORMATION FILE NOT FOUND", hjson_file_str) + #exit() + tlist_generator, pspace_generator = self.get_generators(program) + search_fn = exhaustive_search_v2#random_search + transformations = self.autotune_and_save(self.queue, program, search_fn, + tlist_generator, pspace_generator, hjson_file_str) + + program = dgk.apply_transformation_list(program, transformations) + + """ + # Kernels to not autotune. Should probably still load the transformation from a + # generator function. Should these be put in GrudgeArrayContext + + # Maybe this should have an autotuner + # There isn't much room for optimization due to the indirection + elif "resample_by_picking" in program.default_entrypoint.name: + for arg in program.default_entrypoint.args: + if arg.name == "n_to_nodes": + # Assumes this has has a single ParameterValue tag + n_to_nodes = arg.tags[0].value + + l0 = ((1024 // n_to_nodes) // 32) * 32 + if l0 == 0: + l0 = 16 + if n_to_nodes*16 > 1024: + l0 = 8 + c + + outer = max(l0, 32) + + program = set_memory_layout(program) + program = lp.split_iname(program, "iel", outer, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + elif "actx_special" in program.default_entrypoint.name: # Fixed + program = set_memory_layout(program) + # Sometimes sqrt is called on single values. 
+ if "i0" in program.default_entrypoint.inames: + program = lp.split_iname(program, "i0", 512, outer_tag="g.0", + inner_tag="l.0", slabs=(0, 1)) + #program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + # slabs=(0,1)) + #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + # inner_tag="l.0") + #program = lp.split_iname(program, "i1", 20, outer_tag="g.1", + # inner_tag="l.1", slabs=(0,0)) + #program2 = lp.join_inames(program, ("i1", "i0"), "i") + #from islpy import BasicMap + #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}") + #program2 = lp.map_domain(program, m) + #print(program2) + #exit() + + #program = super().transform_loopy_program(program) + #print(program) + #print(lp.generate_code_v2(program).device_code()) + + # Not really certain how to do grudge_assign, done for flatten + elif "flatten" in program.default_entrypoint.name: + + program = set_memory_layout(program) + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + else: + #print(program) + #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name) + # The PyOpenCLArrayContext transformations can fail when inames are fixed. + program = super().transform_loopy_program(program) + + ''' + # These still depend on the polynomial order = 3 + # Never called? 
+ # This is going away anyway probably + elif "resample_by_mat" in program.default_entrypoint.name: + hjson_file = pkg_resources.open_text(dgk, f"{program.default_entrypoint.name}.hjson") + + # Order 3: 10 x 10 + # Order 4: 15 x 35 + + #print(program) + #exit() + pn = 3 # This needs to be not fixed + fp_string = "FP64" + + indices = [transform_id, fp_string, str(pn)] + transformations = dgk.load_transformations_from_file(hjson_file, + indices) + hjson_file.close() + print(transformations) + program = dgk.apply_transformation_list(program, transformations) + + # Not really certain how to do grudge_assign, done for flatten + elif "grudge_assign" in program.default_entrypoint.name or "flatten" in program.default_entrypoint.name: + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + + ''' + """ + else: + # print("USING FALLBACK TRANSFORMATIONS FOR " + program.default_entrypoint.name) + program = super().transform_loopy_program(program) + + return program + + +class COrderedAutotuningArrayContext(COrderedGrudgeArrayContext): + + #@memoize_method #Should this be memoized? + def get_generators(self, program): + + # Maybe the generators should be classes so we can use inheritance. 
def autotune_and_save(self, queue, program, search_fn, tlist_generator,
        pspace_generator, hjson_file_str, time_limit=np.inf):
    """Run *search_fn* on *program* and save the resulting transformations.

    *search_fn* is called as ``search_fn(queue, program, generic_test,
    pspace_generator, tlist_generator, time_limit=time_limit)`` and must
    return a 3-tuple whose middle element is the transformation list. That
    list is written to *hjson_file_str* under the ``"transformations"`` key
    and returned.

    :raises: the ``cl._cl.RuntimeError`` raised by *search_fn* is re-raised
        after the profiling hint is printed. Previously execution fell
        through to the unbound ``transformations`` name and failed with an
        unrelated ``UnboundLocalError``.
    """
    from hjson import dump

    try:
        # The first and last tuple elements (named avg_time and data in the
        # original code) are not used here.
        _, transformations, _ = search_fn(queue, program, generic_test,
                pspace_generator, tlist_generator, time_limit=time_limit)
    except cl._cl.RuntimeError as e:
        print(e)
        print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
        # There is nothing to save without a search result.
        raise

    # The context manager closes the file even if serialization fails.
    with open(hjson_file_str, "wt+") as out_file:
        dump({"transformations": transformations}, out_file, default=convert)
    print("WRITING TRANSFORMATION FILE:", hjson_file_str)

    return transformations
+ + print("Opening file:", hjson_file_str) + hjson_file = open(hjson_file_str, "rt") + + try: # New hjson structure + transformations = dgk.load_transformations_from_file(hjson_file, + ["transformations"]) + print("LOCATED TRANSFORMATION:", hjson_file_str) + #exit() + except KeyError as e: + # This can eventually be removed since we're now using the hash of the program code to specify the file. + # Kernels with different dimensions will have different files. + hjson_file.seek(0,0) # Move read location back to beginning + + fp_format = None + ndofs = None # The value doesn't matter now + transform_id = get_transformation_id(device_id) + + for arg in program.default_entrypoint.args: + if IsOpArray() in arg.tags: + dim = 1 + ndofs = arg.shape[0] + fp_format = arg.dtype.numpy_dtype + break + elif IsSepVecOpArray() in arg.tags or IsVecOpArray() in arg.tags: + ndofs = arg.shape[1] + fp_format = arg.dtype.numpy_dtype + break + elif IsFaceMassOpArray() in arg.tags: + ndofs = arg.shape[0] + fp_format = arg.dtype.numpy_dtype + break + elif IsDOFArray() in arg.tags: + ndofs = arg.shape[1] + fp_format = arg.dtype.numpy_dtype + break + + if fp_format is None: + print("Unknown fp_format") + exit() + if ndofs is None: + print("Unknown ndofs") + exit() + + fp_string = get_fp_string(fp_format) + indices = [transform_id, fp_string, str(ndofs)] + transformations = dgk.load_transformations_from_file(hjson_file, + indices) + + hjson_file.close() + + #except (KeyError, FileNotFoundError) as e: + # There shouldn't be any more key errors now that PIDs are used + except FileNotFoundError as e: + + """ + # Maybe the generators should be classes so we can use inheritance. 
+ if program.default_entrypoint.name == "einsum3to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum3to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum4to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum4to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum5to3_kernel": + from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum5to3_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "einsum2to2_kernel": + from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import einsum2to2_kernel_pspace_generator as pspace_generator + elif program.default_entrypoint.name == "grudge_elementwise_sum_knl": + from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_tlist_generator as tlist_generator + from grudge.loopy_dg_kernels.generators import grudge_elementwise_sum_knl_pspace_generator as pspace_generator + else: + from grudge.loopy_dg_kernels.generators import gen_autotune_list as pspace_generator + from grudge.loopy_dg_kernels.generators import mxm_trans_list_generator as tlist_generator + + try: + avg_time, transformations, data = search_fn(self.queue, program, generic_test, + pspace_generator, tlist_generator, time_limit=np.inf) + except cl._cl.RuntimeError as e: + print(e) + print("Profiling is not enabled and the PID does not match any transformation file. 
Turn on profiling and run again.") + + od = {"transformations": transformations} + out_file = open(hjson_file_str, "wt+") + + hjson.dump(od, out_file,default=convert) + out_file.close() + #from pprint import pprint + #pprint(od) + """ + print("TRANSFORMATION FILE NOT FOUND", hjson_file_str) + #exit() + tlist_generator, pspace_generator = self.get_generators(program) + search_fn = exhaustive_search_v2#random_search + transformations = self.autotune_and_save(self.queue, program, search_fn, + tlist_generator, pspace_generator, hjson_file_str) + + program = dgk.apply_transformation_list(program, transformations) + + """ + # Kernels to not autotune. Should probably still load the transformation from a + # generator function. Should these be put in GrudgeArrayContext + + # Maybe this should have an autotuner + # There isn't much room for optimization due to the indirection + elif "resample_by_picking" in program.default_entrypoint.name: + for arg in program.default_entrypoint.args: + if arg.name == "n_to_nodes": + # Assumes this has has a single ParameterValue tag + n_to_nodes = arg.tags[0].value + + l0 = ((1024 // n_to_nodes) // 32) * 32 + if l0 == 0: + l0 = 16 + if n_to_nodes*16 > 1024: + l0 = 8 + c + + outer = max(l0, 32) + + program = set_memory_layout(program) + program = lp.split_iname(program, "iel", outer, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", l0, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", n_to_nodes, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + elif "actx_special" in program.default_entrypoint.name: # Fixed + program = set_memory_layout(program) + # Sometimes sqrt is called on single values. 
+ if "i0" in program.default_entrypoint.inames: + program = lp.split_iname(program, "i0", 512, outer_tag="g.0", + inner_tag="l.0", slabs=(0, 1)) + #program = lp.split_iname(program, "i0", 128, outer_tag="g.0", + # slabs=(0,1)) + #program = lp.split_iname(program, "i0_inner", 32, outer_tag="ilp", + # inner_tag="l.0") + #program = lp.split_iname(program, "i1", 20, outer_tag="g.1", + # inner_tag="l.1", slabs=(0,0)) + #program2 = lp.join_inames(program, ("i1", "i0"), "i") + #from islpy import BasicMap + #m = BasicMap("[x,y] -> {[n0,n1]->[i]:}") + #program2 = lp.map_domain(program, m) + #print(program2) + #exit() + + #program = super().transform_loopy_program(program) + #print(program) + #print(lp.generate_code_v2(program).device_code()) + + # Not really certain how to do grudge_assign, done for flatten + elif "flatten" in program.default_entrypoint.name: + + program = set_memory_layout(program) + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + else: + #print(program) + #print("USING FALLBACK TRANSORMATIONS FOR " + program.default_entrypoint.name) + # The PyOpenCLArrayContext transformations can fail when inames are fixed. + program = super().transform_loopy_program(program) + + ''' + # These still depend on the polynomial order = 3 + # Never called? 
+ # This is going away anyway probably + elif "resample_by_mat" in program.default_entrypoint.name: + hjson_file = pkg_resources.open_text(dgk, f"{program.default_entrypoint.name}.hjson") + + # Order 3: 10 x 10 + # Order 4: 15 x 35 + + #print(program) + #exit() + pn = 3 # This needs to be not fixed + fp_string = "FP64" + + indices = [transform_id, fp_string, str(pn)] + transformations = dgk.load_transformations_from_file(hjson_file, + indices) + hjson_file.close() + print(transformations) + program = dgk.apply_transformation_list(program, transformations) + + # Not really certain how to do grudge_assign, done for flatten + elif "grudge_assign" in program.default_entrypoint.name or "flatten" in program.default_entrypoint.name: + # This is hardcoded. Need to move this to separate transformation file + #program = lp.set_options(program, "write_cl") + program = lp.split_iname(program, "iel", 128, outer_tag="g.0", + slabs=(0, 1)) + program = lp.split_iname(program, "iel_inner", 32, outer_tag="ilp", + inner_tag="l.0") + program = lp.split_iname(program, "idof", 20, outer_tag="g.1", + inner_tag="l.1", slabs=(0, 0)) + + + ''' + """ + else: + # print("USING FALLBACK TRANSFORMATIONS FOR " + program.default_entrypoint.name) + program = super().transform_loopy_program(program) + + return program + + + +class KernelSavingAutotuningArrayContext(AutotuningArrayContext): + def transform_loopy_program(self, program): + + if program.default_entrypoint.name in autotuned_kernels: + import pickle + # Set no_numpy and return_dict options here? + program = set_memory_layout(program, order="F") + + print("====CALCULATING PROGRAM ID====") + filename = "./pickled_programs" + pid = unique_program_id(program) + + # Is there a way to obtain the current rank? 
+ file_path = f"{filename}/{program.default_entrypoint.name}_{pid}.pickle" + hjson_path = f"hjson/{program.default_entrypoint.name}_{pid}.hjson" + from os.path import exists + + if not exists(file_path): + # For some reason this doesn't create the directory + os.makedirs(os.path.dirname(filename), exist_ok=True) + print(program.default_entrypoint) + print("====WRITING PROGRAM TO FILE===", file_path) + out_file = open(file_path, "wb") + pickle.dump(program, out_file) + out_file.close() + print("====READING PROGRAM FROM FILE===", file_path) + f = open(file_path, "rb") + loaded = pickle.load(f) + f.close() + pid2 = unique_program_id(loaded) + print(pid, pid2) + assert pid == pid2 + print("DUMPED PICKLED FILE. EXITING - RUN THE AUTOTUNER") + elif exists(hjson_path): # Use the transformations + program = super().transform_loopy_program(program) + else: + print("PICKLED FILE ALREADY EXISTS. RUN THE AUTOTUNER.", file_path) + exit() + else: + program = super().transform_loopy_program(program) + + return program + + +# vim: foldmethod=marker diff --git a/grudge/grudge_tags.py b/grudge/grudge_tags.py new file mode 100644 index 000000000..92badc130 --- /dev/null +++ b/grudge/grudge_tags.py @@ -0,0 +1,32 @@ +from pytools.tag import Tag, UniqueTag +from meshmode.transform_metadata import IsDOFArray, IsOpArray, ParameterValue, EinsumArgsTags + +class KernelDataTag(Tag): # Delete this when no longer needed + """A tag that applies to :class:`loopy.LoopKernel`. Kernel data provided + with this tag can be later applied to the kernel. 
This is used, for + instance, to specify kernel data in einsum kernels.""" + + def __init__(self, kernel_data): + self.kernel_data = kernel_data + + +class IsVecDOFArray(Tag): + pass + +class IsFaceDOFArray(Tag): + pass + +class IsVecOpArray(Tag): + pass + +class IsSepVecDOFArray(Tag): + pass + +class IsSepVecOpArray(Tag): + pass + +class IsFaceMassOpArray(Tag): + pass + +class IsFourAxisDOFArray(Tag): + pass diff --git a/grudge/loopy_dg_kernels/__init__.py b/grudge/loopy_dg_kernels/__init__.py new file mode 100644 index 000000000..5c48b7b8e --- /dev/null +++ b/grudge/loopy_dg_kernels/__init__.py @@ -0,0 +1,425 @@ +import numpy as np +from pytools import memoize_in + +#import pyopencl as cl +#import pyopencl.array +#import pyopencl.clrandom + +import loopy as lp +from grudge.grudge_tags import IsDOFArray, ParameterValue +#from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 +#from loopy.kernel.data import AddressSpace + +#import pycuda.gpuarray as cuarray +#import pycuda.driver as drv +#import pycuda.tools +#import pycuda.autoinit +#from pycuda.compiler import SourceModule +#from pycuda.curandom import rand as curand + +#from modepy import equidistant_nodes + +#from bs4 import UnicodeDammit +import hjson +#import time +#from math import ceil +#import sys + +# setup +# ----- +lp.set_caching_enabled(False) +import loopy.options +loopy.options.ALLOW_TERMINAL_COLORS = False + +# A lot of this could probably be deleted + +def gen_face_mass_knl_merged(nelements, nfaces, nvol_nodes, nface_nodes, fp_format): + knl = lp.make_kernel( + """{[iel,idof,fj]: + 0<=iel 1: + args = args + t[1] + kwargs = t[2] if len(t) > 2 else {} + knl = func(*args, **kwargs) + + return knl diff --git a/grudge/loopy_dg_kernels/device_mappings.hjson b/grudge/loopy_dg_kernels/device_mappings.hjson new file mode 100644 index 000000000..aa90615cc --- /dev/null +++ b/grudge/loopy_dg_kernels/device_mappings.hjson @@ -0,0 +1,8 @@ +{ + # The idea with mapping devices to uuids is that multiple 
devices can map to + # a single set of transformations. + "NVIDIA Titan V": 72a3ce98-5d21-48bf-b402-6ee96bafd1b6 + "NVIDIA GTX Titan X": 1d7cab16-19bd-4474-95f2-44ed1c0e60df +} + + diff --git a/grudge/loopy_dg_kernels/diff_1d_transform.hjson b/grudge/loopy_dg_kernels/diff_1d_transform.hjson new file mode 100644 index 000000000..289aa61c8 --- /dev/null +++ b/grudge/loopy_dg_kernels/diff_1d_transform.hjson @@ -0,0 +1,149 @@ +# transform ID -> fp format -> pn +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: { + description: "Transformations for the NVIDIA Titan V" + # 64-bit or 32-bit kernel + FP32:{ + # Polynomial order + 2:[ + # Format: [Transformation, args, kwargs] + #["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + #["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 4:[ + + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", 
"j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 5:[ + #["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + 6:[ + #["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + #["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", 
"j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + FP64: { + 10:[ + # Format: [Transformation, args, kwargs] + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}], + ["add_inames_for_unused_hw_axes"] + ], + 20:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + #["split_iname", ["iel", 16], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + + # For tests comment this + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["diff_mat", "idof_inner_outer,idof_inner_inner,j"], {temporary_name: "matfp", default_tag: "l.auto"}], + #["add_prefetch", ["diff_mat", "j"], 
{temporary_name: "matfp", default_tag: "unr"}], + ["split_iname", ["j", 20], {outer_tag: "for", inner_tag: "for"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 35:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + + ["add_inames_for_unused_hw_axes"] + ], + 56:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ] + 84:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 120:[ + 
["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/diff_2d_transform.hjson b/grudge/loopy_dg_kernels/diff_2d_transform.hjson new file mode 100644 index 000000000..0b48f5519 --- /dev/null +++ b/grudge/loopy_dg_kernels/diff_2d_transform.hjson @@ -0,0 +1,162 @@ +# transform ID -> dimension -> fp format -> pn +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: { + description: "Transformations for the NVIDIA Titan V" + # 64-bit or 32-bit kernel + FP32:{ + # Polynomial order + 2:[ + # Format: [Transformation, args, kwargs] + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], 
{outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 4:[ + # Move this to array context? + #["tag_array_axes", ["diff_mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ], + 5:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], 
{temporary_name: "mat1fp", default_tag: "unr"}], + 6:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + # Not optimized, just copied from 32 bit version + FP64: { + 10:[ + # Format: [Transformation, 
args, kwargs] + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 101], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}], + ["add_inames_for_unused_hw_axes"] + ], + 20:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 80], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + #["split_iname", ["iel", 16], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + + #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 20], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + # For tests comment this + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + #["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 35:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 56:[ + ["tag_inames", 
[[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ] + 84:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 120:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", 
["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/diff_3d_transform.hjson b/grudge/loopy_dg_kernels/diff_3d_transform.hjson new file mode 100644 index 000000000..785b8ecfd --- /dev/null +++ b/grudge/loopy_dg_kernels/diff_3d_transform.hjson @@ -0,0 +1,165 @@ +# transform ID -> fp format -> pn +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: { + description: "Transformations for the NVIDIA Titan V" + # 64-bit or 32-bit kernel + FP32:{ + # Polynomial order + 2:[ + # Format: [Transformation, args, kwargs] + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 4:[ + # Move this to array context? 
+ #["tag_array_axes", ["diff_mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ], + 5:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + 6:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: 
"for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + FP64: { + 10:[ + # Format: [Transformation, args, kwargs] + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 352], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], 
+ ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}], + ["add_inames_for_unused_hw_axes"] + ], + 20:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 288], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", slabs:[0,0]}], + ["split_iname", ["idof_inner", 20], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}], + # For tests comment this + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["split_iname", ["j", 5], {outer_tag: "for", inner_tag: "for"}], + #["add_prefetch", ["diff_mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 35:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", slabs:[0,0]}], + ["split_iname", ["idof_inner", 35], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["split_iname", ["j", 35], {outer_tag: "for", inner_tag: "for"}], + ["add_inames_for_unused_hw_axes"] + ], + 56:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag:"l.1", slabs:[0,0]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}], 
+ ["split_iname", ["j", 56], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + #["add_prefetch", ["diff_mat", "imatrix,idof_inner,j_outer,j_inner"], {temporary_name: "matf", default_tag: "l.auto"}], + ["add_inames_for_unused_hw_axes"] + #["prioritize_loops", ["iel_outer,iel_inner_outer,iel_inner_inner,imatrix,j_outer,j_inner"]] + ] + 84:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + ["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + ["add_inames_for_unused_hw_axes"] + ], + 120:[ + ["tag_inames", [[["imatrix", "ilp"]]]], + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["diff_mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", 
["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/elwise_linear_transform.hjson b/grudge/loopy_dg_kernels/elwise_linear_transform.hjson new file mode 100644 index 000000000..b67c13a81 --- /dev/null +++ b/grudge/loopy_dg_kernels/elwise_linear_transform.hjson @@ -0,0 +1,185 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: { + description: "Transformations for the NVIDIA Titan V" + # 64-bit or 32-bit kernel + FP32:{ + # Polynomial order + 2:[ + # Format: [Transformation, args, kwargs] + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["idof", 20], {outer_tag: "g.1"}], + #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 4:[ + # Move this to array context? 
+ #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ], + 5:[ + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + 6:[ + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + 
["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + # Not optimized, just copied from 32 bit version + FP64: { + 10:[ + # Format: [Transformation, args, kwargs] + #["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], 
{outer_tag: "g.1", slabs:[0,0]}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1", slabs:[0,0]}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for"}], + ["add_inames_for_unused_hw_axes"], + ], + 20:[ + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + #["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}], + #["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 20], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + # For tests comment this + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag:"l.auto"}], + #["split_iname", ["vec_dim_1_outer", "20"], {outer_tag:"g.1"}], + #["split_iname", ["vec_dim_1_outer", "20"], {outer_tag:"g.1", inner_tag:"ilp", slabs:[0,0]}], + #["tag_inames", [[ ["vec_dim_0_inner", "l.0"], + # ["vec_dim_1_inner", "l.1"], + # ["vec_dim_1_outer","ilp"], + # ["vec_dim_0_outer","ilp"]]]], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ["split_iname", ["j", 20], {outer_tag: "for", inner_tag: "for", slabs:[0,0]}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"], + ], + 35:[ + # Move this to array context? 
+ #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 56], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + # See if these pass the tests + #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 35], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], + + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["tag_array_axes", ["vecf", "f,f"]], + ["split_iname", ["j", 35], {outer_tag: "for", inner_tag: "for"}], + + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ["add_inames_for_unused_hw_axes"], + ], + 56:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ] + 84:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 84], 
{outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 120:[ + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/face_mass_transform.hjson b/grudge/loopy_dg_kernels/face_mass_transform.hjson new file mode 100644 index 000000000..84632357b --- /dev/null +++ b/grudge/loopy_dg_kernels/face_mass_transform.hjson @@ -0,0 +1,170 @@ +{ + 72a3ce98-5d21-48bf-b402-6ee96bafd1b6: { + description: 
"Transformations for the NVIDIA Titan V" + # 64-bit or 32-bit kernel + FP32:{ + # Polynomial order + 2:[ + # Format: [Transformation, args, kwargs] + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["idof", 20], {outer_tag: "g.1"}], + #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 4:[ + # Move this to array context? 
+ #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ], + 5:[ + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + 6:[ + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + 
["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + # Not optimized, just copied from 32 bit version + FP64: { + 2:[ + # Format: [Transformation, args, kwargs] + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 3:[ + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + #["split_iname", 
["iel", 32], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}], + ["split_iname", ["idof", 20], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 4], {outer_tag: "ilp", inner_tag: "l.1"}], + # For tests comment this + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["split_iname", ["j", 10], {outer_tag: "for", inner_tag: "for", slabs:[0,0]}], + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 4:[ + # Move this to array context? + #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + # See if these pass the tests + #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 35], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], + + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ["tag_array_axes", ["vecf", "f,f"]], + ["add_inames_for_unused_hw_axes"] + ], + 5:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + + 
["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ] + 6:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 84], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", 
default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/generators.py b/grudge/loopy_dg_kernels/generators.py new file mode 100644 index 000000000..07a94e7f3 --- /dev/null +++ b/grudge/loopy_dg_kernels/generators.py @@ -0,0 +1,625 @@ +import numpy as np +from grudge.grudge_tags import (IsDOFArray, IsSepVecDOFArray, + IsOpArray, IsSepVecOpArray, IsFaceDOFArray, IsFaceMassOpArray, + IsVecDOFArray, IsVecOpArray, IsFourAxisDOFArray) + +def k_inner_inner_options(start_val=None): + #options = [8, 16, 4, 32] + options = [32, 16, 8] + start_ind = 0 if start_val is None else options.index(start_val) + options = options[start_ind:] + return options + + +def k_inner_outer_options(n_in, k_inner_inner, sm_size, + fp_bytes=8, start_val=None, nelem=None): + ilp_limit = min(nelem // k_inner_inner, 6) if nelem is not None else 6 + # Possibilities limited by size of local memory + # Use sm_size - 1 because CUDA errors when all of local memory is used + options = np.arange(1, ((sm_size - 1) // (fp_bytes*k_inner_inner*n_in)) + 1) + #Arbitrarily limit to at max 6 inline to limit search space + options = list(k_inner_inner*options[options <= ilp_limit]) + start_ind = 0 if start_val is None else options.index(start_val) + options = options[start_ind:] + return options + +def i_inner_inner_options(n_out, k_inner_inner, max_work_group_size=1024, start_val=None): + factors = np.arange(1, n_out+1)[(n_out % np.arange(1, n_out+1)) == 0] + # Fix for AMD + #factors = np.arange(3, n_out+1)[(n_out % np.arange(2, n_out+1)) == 0] + # Ensure total number of workitems is less than maximum + usable_factors = factors[factors*k_inner_inner <= max_work_group_size] + options = sorted(usable_factors, reverse=True) + start_ind = 0 if start_val is None else options.index(start_val) + options = 
options[start_ind:] + return options + +def i_inner_outer_options(n_out, i_inner_inner, start_val=None): + # Select a number of inline blocks such that n_out % outer*inner == 0 + # Bumping up the start of the range could reduce autotune time, but an empty + # autotune set might be returned if i < start value + + # Loopy confused about the number of dimensions when + # i_outer, i_inner_outer, and i_inner_inner are all 1 + inline = [1] if n_out == 1 else np.arange(2, (n_out // i_inner_inner) + 1) + options = list(i_inner_inner*inline[n_out % (inline*i_inner_inner) == 0]) + start_ind = 0 if start_val is None else options.index(start_val) + options = options[start_ind:] + return options + + +def j_inner_options(n_in, start_val=None): + + start = 1 + factors = list(np.arange(start, n_in + 1)[(n_in % np.arange(start, n_in + 1)) == 0]) + #factors = list(np.arange(1, n_in + 1)[(n_in % np.arange(1, n_in + 1)) == 0]) + # Should this be limited by the number of registers + start_ind = 0 if start_val is None else factors.index(start_val) + factors = factors[start_ind:] + return factors + +# Creates a list containing tuples of search space parameters. 
+# Will need to create separate ones of this for each einsum kernel +def gen_autotune_list(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + nfaces = 1 + + n_in = None + print(knl.default_entrypoint.name) + ndof_arrays = 0 + for arg in knl.default_entrypoint.args: + print(arg.name) + if "resample_by_mat" not in knl.default_entrypoint.name: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + ndof_arrays += 1 + elif IsSepVecOpArray() in arg.tags: + n_mat, n_out, n_in = arg.shape + elif IsOpArray() in arg.tags: + n_out, n_in = arg.shape + elif IsFaceDOFArray() in arg.tags: + nfaces, n_elem, n_in = arg.shape + else: + if IsOpArray() in arg.tags: + n_out, n_in = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + ndof_arrays = max(ndof_arrays, 1) + if n_in is None: + n_in = n_out + + n_in = n_in * nfaces #Prevents shared memory from overflowing in face mass kernel + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s = start_param + else: + kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None) + + # Iterate over five search dimensions + # Maybe there is a way to use islpy to do this? + parameter_list = [] + for kii in k_inner_inner_options(start_val=kii_s): + # Should come up with a way to set the effective local memory size. It depends on the number of + # arrays actually prefetched. + for kio in k_inner_outer_options(n_in*nfaces, kii, local_mem_size // ndof_arrays, fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + # Kernel does not reach here. 
+ iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + choices = (kio, kii, iio, iii, ji) + parameter_list.append(choices) + + return parameter_list + + +# Should separate this so don't need to supply knl +def mxm_trans_list_generator(params, **kwargs): + trans_list = [] + kio, kii, iio, iii, ji = params + knl = kwargs["knl"] + + + #if "diff" in knl.default_entrypoint.name: + # trans_list.append(["tag_inames", ["imatrix: ilp"]]) + + trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["iel_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["idof_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + + if knl.default_entrypoint.name == "face_mass": + pass + #trans_list.append(["add_prefetch", ["vec", "f,j,iel_inner_outer,iel_inner_inner"], + # {"temporary_name":"vecf", "default_tag":"l.auto"}]) + #trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2"]]) + #elif knl.default_entrypoint.name == "nodes": + elif knl.default_entrypoint.name == "lp_nodes": + trans_list.append(["add_prefetch", ["nodes", "j,iel_inner_outer,iel_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "f,f"]]) + elif "resample_by_mat" in knl.default_entrypoint.name: + # Indirection may prevent prefetching + pass + else: + trans_list.append(["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "f,f"]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + + +def grudge_elementwise_sum_knl_tlist_generator(params, **kwargs): + trans_list = [] + kio, kii, 
iio, iii, ji = params + knl = kwargs["knl"] + + trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["iel_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["idof_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + # Should the i loop have (0,1) slabs for both? + + #trans_list.append(["add_prefetch", ["operand", "iel_inner_outer,iel_inner_inner"], + # {"temporary_name":"operandf", "default_tag":"l.auto"}]) + #trans_list.append(["tag_array_axes", ["operandf", "f,f"]]) + + # Realistically, splitting the j loop probably is not necessary for this. + trans_list.append(["split_iname", ["jdof", ji], {"outer_tag":"for", "inner_tag":"for"}]) + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + +def grudge_elementwise_sum_knl_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + n_in = n_out + fp_bytes = arg.dtype.dtype.itemsize + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s = start_param + else: + kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None) + + # Iterate over five search dimensions. Could reduce this to 4 if ignore j-loop. 
+ parameter_list = [] + if n_elem > 0: + for kii in k_inner_inner_options(start_val=kii_s): + # Both jac and vec are prefetched so the available local_memory per prefetched array is halved + for kio in k_inner_outer_options(n_in, kii, local_mem_size, fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + choices = (kio, kii, iio, iii, ji) + parameter_list.append(choices) + + return parameter_list + + +def einsum3to2_kernel_tlist_generator(params, **kwargs): + trans_list = [] + kio, kii, iio, iii, ji, lm_layout = params + if 0 not in params: # If there is a zero length dimension then don't transform + knl = kwargs["knl"] + + if kio != kii: + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["e_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + prefetch_str = "j,e_inner_outer,e_inner_inner" + else: + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "inner_tag": "l.0", "slabs":(0,0)}]) + prefetch_str = "j,e_inner" + if iio != iii: + trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["i_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + else: + trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "inner_tag": "l.1", "slabs":(0,0)}]) + # Should the i loop have (0,1) slabs for both? 
+ + for arg in knl.default_entrypoint.args: + + if "vec" == arg.name: + trans_list.append(["add_prefetch", ["vec", prefetch_str], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", lm_layout]]) + elif "jac" == arg.name: + trans_list.append(["add_prefetch", ["jac", prefetch_str], + {"temporary_name":"jacf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["jacf", lm_layout]]) + elif "arg2" == arg.name and IsDOFArray() in arg.tags: + trans_list.append(["add_prefetch", ["arg2", prefetch_str], + {"temporary_name":"arg2f", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["arg2f", lm_layout]]) + elif "arg1" == arg.name and IsDOFArray() in arg.tags: + trans_list.append(["add_prefetch", ["arg1", prefetch_str], + {"temporary_name":"arg1f", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["arg1f", lm_layout]]) + elif "arg0" == arg.name and IsDOFArray() in arg.tags: + arg0_prefetch_str = "i_inner," if iio == iii else "i_inner_outer,i_inner_inner," + arg0_prefetch_str += "e_inner" if kio == kii else "e_inner_outer,e_inner_inner" + trans_list.append(["add_prefetch", + ["arg0", arg0_prefetch_str], + {"temporary_name":"arg0f", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["arg0f", lm_layout]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + +def einsum3to2_kernel_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + + n_dof_arrays = 0 + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + n_dof_arrays += 1 + elif IsOpArray() in arg.tags: + n_out, n_in = arg.shape + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s, lm_layout = start_param + 
else: + kio_s, kii_s, iio_s, iii_s, ji_s, lm_layout = (None, None, None, None, None, None) + + # Iterate over six search dimensions + parameter_list = [] + + if n_elem*n_out <= 1024: + choices = (n_elem, n_elem, n_out, n_out, n_in, "c,c") + parameter_list.append(choices) + choices = (n_elem, n_elem, n_out, n_out, n_in, "f,f") + parameter_list.append(choices) + else: + for kii in k_inner_inner_options(start_val=kii_s): + # Both jac and vec are prefetched so the available local_memory per prefetched array is halved + # Should check if jac is present + for kio in k_inner_outer_options(n_in, kii, local_mem_size // n_dof_arrays, + fp_bytes=fp_bytes,start_val=kio_s,nelem=n_elem): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + for lm_layout in ["f,f", "c,c"]: + choices = (kio, kii, iio, iii, ji, lm_layout) + parameter_list.append(choices) + + return parameter_list + + +def einsum2to2_kernel_tlist_generator(params, **kwargs): + trans_list = [] + kio, kii, iio, iii = params + knl = kwargs["knl"] + + if knl.default_entrypoint.name == "resample_by_picking_single_indirection": + trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["iel_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["idof_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + else: + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["e_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + 
trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["i_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + # Should the i loop have (0,1) slabs for both? + + # Prefetching probably matters not for this kernel + #trans_list.append(["add_prefetch", ["arg1", "e_inner_outer,e_inner_inner,i_inner_outer,i_inner_inner"], + # {"temporary_name":"arg1f", "default_tag":"l.auto"}]) + #trans_list.append(["tag_array_axes", ["arg1f", "f,f"]]) + + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + + +def einsum2to2_kernel_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + + n_elem = None + n_out = None + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + if n_elem is None: + n_elem, n_out = arg.shape + else: # Needed to handle resample_by_picking_group + n_elem = min(arg.shape[0], n_elem) + n_out = min(arg.shape[1], n_out) + n_in = n_out + fp_bytes = arg.dtype.dtype.itemsize + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s = start_param + else: + kio_s, kii_s, iio_s, iii_s = (None, None, None, None) + + # Iterate over five search dimensions + parameter_list = [] + if n_elem > 0: + for kii in k_inner_inner_options(start_val=kii_s): + for kio in k_inner_outer_options(n_in, kii, local_mem_size, fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + #for ji in j_inner_options(n_in, start_val=ji_s): + # ji_s = None + choices = (kio, kii, iio, iii) + parameter_list.append(choices) + + return parameter_list + + +def einsum4to2_face_mass_kernel_tlist_generator(params, **kwargs): + 
trans_list = [] + kio, kii, iio, iii, ji = params + + trans_list.append(["tag_inames", ["f: unr"]]) + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["e_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["i_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + # Should the i loop have (0,1) slabs for both? + + trans_list.append(["add_prefetch", ["vec", "f,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "N2,N0,N1"]]) + + trans_list.append(["add_prefetch", ["jac_surf", "f,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"jac_surff", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["jac_surff", "N2,N0,N1"]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + + trans_list.append(["add_inames_for_unused_hw_axes"]) + + return trans_list + +""" +def einsum4to2_face_mass_kernel_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsVecDOFArray() in arg.tags: + n_r, n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsVecOpArray() in arg.tags: + n_r, n_out, n_in = arg.shape + elif IsFaceMassOpArray() in arg.tags: + n_out, n_r, n_in = arg.shape + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s = start_param + else: + kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None) + + # Iterate over five search dimensions + parameter_list = [] + for kii in k_inner_inner_options(start_val=kii_s): + # Both 
inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced + for kio in k_inner_outer_options(n_in, kii, local_mem_size // (n_r + 1), fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + choices = (kio, kii, iio, iii, ji) + parameter_list.append(choices) + + return parameter_list +""" + + +def einsum4to2_kernel_tlist_generator(params, **kwargs): + trans_list = [] + kio, kii, iio, iii, ji, o = params + knl = kwargs["knl"] + arg_names = {arg.name for arg in knl.default_entrypoint.args} + inames = knl.default_entrypoint.inames.keys() + + if "r" in inames: + trans_list.append(["tag_inames", ["r: unr"]]) + if "f" in inames: + trans_list.append(["tag_inames", ["f: unr"]]) + + + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["e_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["i_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + # Should the i loop have (0,1) slabs for both? 
+ + #trans_list.append(["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"], + # {"temporary_name":"vecf", "default_tag":"l.auto"}]) + #trans_list.append(["tag_array_axes", ["vecf", "f,f"]]) + if "inv_jac_t" in arg_names: + trans_list.append(["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "N0,N1" if o == "F" else "N1,N0"]]) + + trans_list.append(["add_prefetch", ["inv_jac_t", "r,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"inv_jac_tf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["inv_jac_tf", "N2,N0,N1" if o == "F" else "N2,N1,N0"]]) + elif "jac_surf" in arg_names: + trans_list.append(["add_prefetch", ["vec", "f,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + # See if N2,N0,N1 works for "F" order, may need to change it in the array context + trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2" if o =="F" else "N2,N1,N0"]]) + + trans_list.append(["add_prefetch", ["jac_surf", "f,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"inv_jac_tf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["inv_jac_tf", "N1,N0,N2" if o == "F" else "N2,N1,N0"]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + +def einsum4to2_kernel_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + lmem_divisor = 0 + + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsVecDOFArray() in arg.tags: + n_r, n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsFaceDOFArray() in arg.tags: + n_r, n_elem, n_in = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif 
IsVecOpArray() in arg.tags: + n_r, n_out, n_in = arg.shape + lmem_divisor = n_r + 1 + elif IsFaceMassOpArray() in arg.tags: + n_out, n_r, n_in = arg.shape + lmem_divisor = 2*n_r + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s = start_param + else: + kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None) + + # Iterate over five search dimensions + parameter_list = [] + if n_elem > 0: + for kii in k_inner_inner_options(start_val=kii_s): + # Both inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced + for kio in k_inner_outer_options(n_in, kii, local_mem_size // lmem_divisor, fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + for order in ["F","C"]: + choices = (kio, kii, iio, iii, ji,order) + parameter_list.append(choices) + + return parameter_list + + +def einsum5to3_kernel_tlist_generator(params, **kwargs): + trans_list = [] + kio, kii, iio, iii, ji, lm_ord = params + if lm_ord in "fF": + vecf_ord = "f,f" + inv_jac_tf_ord = "N3,N2,N0,N1" + else: + vecf_ord = "c,c" + inv_jac_tf_ord = "N3,N2,N1,N0" + trans_list.append(["tag_inames", ["r: unr"]]) + trans_list.append(["tag_inames", ["x: ilp"]]) + trans_list.append(["split_iname", ["e", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["e_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["i", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["i_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + # Should the i loop have (0,1) slabs for both? 
+ + trans_list.append(["add_prefetch", ["vec", "j,e_inner_outer,e_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", vecf_ord]]) + trans_list.append(["add_prefetch", ["inv_jac_t", "x,r,j,e_inner_outer,e_inner_inner"], + {"temporary_name":"inv_jac_tf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["inv_jac_tf", inv_jac_tf_ord]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + trans_list.append(["add_inames_for_unused_hw_axes"]) + return trans_list + +def einsum5to3_kernel_pspace_generator(queue, knl, start_param=None): + + local_mem_size = queue.device.local_mem_size + max_work_group_size = queue.device.max_work_group_size + + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsFourAxisDOFArray() in arg.tags: + n_r, n_x, n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsVecOpArray() in arg.tags: + n_r, n_out, n_in = arg.shape + + if start_param is not None: + kio_s, kii_s, iio_s, iii_s, ji_s, order = start_param + else: + kio_s, kii_s, iio_s, iii_s, ji_s, order = (None, None, None, None, None, None) + + # Iterate over five search dimensions + parameter_list = [] + if n_elem > 0: + for kii in k_inner_inner_options(start_val=kii_s): + # Both inv_jac_t and vec are prefetched so the amount of available local memory per array is reduced + for kio in k_inner_outer_options(n_in, kii, local_mem_size // (n_r*n_x + 1), fp_bytes=fp_bytes,start_val=kio_s): + kio_s = None # Set to None so will form the full set the next time around + for iii in i_inner_inner_options(n_out, kii, + max_work_group_size=max_work_group_size, start_val=iii_s): + iii_s = None + for iio in i_inner_outer_options(n_out, iii, start_val=iio_s): + iio_s = None + for ji in j_inner_options(n_in, start_val=ji_s): + ji_s = None + for order in ["F", "C"]: + choices = 
(kio, kii, iio, iii, ji, order) + parameter_list.append(choices) + + return parameter_list diff --git a/grudge/loopy_dg_kernels/parallel_autotuning.py b/grudge/loopy_dg_kernels/parallel_autotuning.py new file mode 100644 index 000000000..5f99a9760 --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning.py @@ -0,0 +1,113 @@ +from charm4py import charm, Chare, Array, Reducer, Future +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +#from grudge.execution import diff_prg, elwise_linear + +class AutotuneTask(Chare): + + def __init__(self, platform_id, params): + self.platform_id = platform_id + self.params = params + + def get_queue(self): + platform = cl.get_platforms() + gpu_devices = platform[self.platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + return queue + + def run(self): + print([self.params, np.random.rand]) + + +class Test(Chare): + def start(self): + print('I am element', self.thisIndex, 'on PE', charm.myPe(), + 'sending a msg to element 1') + self.thisProxy[1].sayHi() + + #@coro + def sayHi(self, future): + rn = np.random.rand() + print('Hello from element', self.thisIndex, 'on PE', charm.myPe(), 'random', rn) + self.reduce(future, rn, Reducer.max) + +def get_queue(pe_num, platform_num=0): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + #return gpu_devices[pe_num % len(gpu_devices)].int_ptr + +def do_work(args): + params = args[0] + knl = args[1] + queue = get_queue(charm.myPe()) + print("PE: ", charm.myPe()) + avg_time, transform_list = 
dgk.run_tests.apply_transformations_and_run_test(queue, knl, dgk.run_tests.generic_test, params) + return avg_time, params + +def square(x): + return x**2 + + +def main(args): + + # Create queue, assume all GPUs on the machine are the same + """ + platforms = cl.get_platforms() + platform_id = 0 + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + assert charm.numPes() > 1 + assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + + from grudge.execution import diff_prg, elwise_linear_prg + knl = diff_prg(3, 1000000, 10, np.float64) + params = dgk.run_tests.gen_autotune_list(queue, knl) + + args = [[param, knl] for param in params] + + # May help to balance workload + from random import shuffle + shuffle(args) + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + result = charm.pool.map(do_work, args) + sort_key = lambda entry: entry[0] + result.sort(key=sort_key) + + + for r in result: + print(r) + + #knl = diff_prg(3, 100000, 56, np.float64) + #autotune_list = gen_autotune_list(queue, knl) + #print(autotune_list) + + """ + + print(charm.numHosts(), charm.numPes()) + f = Future() + #a = Array(Test, a.numPes()) + #a.sayHi(f) + #result = f.get() + #print(result) + print("All finished") + charm.exit() + +charm.start(main) diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py b/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py new file mode 100644 index 000000000..005d3bec2 --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning_charm4py.py @@ -0,0 +1,249 @@ +from charm4py import entry_method, chare, Chare, Array, Reducer, 
Future, charm +from charm4py.pool import PoolScheduler, Pool +from charm4py.charm import Charm, CharmRemote +#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap +#from charm4py.sections import SectionManager +#import inspect +#import sys +import hjson +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +import os +import grudge.grudge_array_context as gac +import loopy as lp +from os.path import exists +from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test +from grudge.grudge_array_context import convert +#from grudge.execution import diff_prg, elwise_linear + +# Makes one PE inactive on each host so the number of workers is the same on all hosts as +# opposed to the basic PoolScheduler which has one fewer worker on the host with PE 0. +# This can be useful for running tasks on a GPU cluster for example. +class BalancedPoolScheduler(PoolScheduler): + + def __init__(self): + super().__init__() + n_pes = charm.numPes() + n_hosts = charm.numHosts() + pes_per_host = n_pes // n_hosts + + assert n_pes % n_hosts == 0 # Enforce constant number of pes per host + assert pes_per_host > 1 # We're letting one pe on each host be unused + + self.idle_workers = set([i for i in range(n_pes) if not i % pes_per_host == 0 ]) + self.num_workers = len(self.idle_workers) + +# Use all PEs including PE 0 +class AllPEsPoolScheduler(PoolScheduler): + + def __init__(self): + super().__init__() + n_pes = charm.numPes() + n_hosts = charm.numHosts() + + self.idle_workers = set(range(n_pes)) + self.num_workers = len(self.idle_workers) + + +def get_queue(pe_num, platform_num): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + +# Just assume each rank has one 
processor and create a queue +# Breaks for some reason. Maybe because the tasks migrate and the underlying hardware +# address changes so the queue is not for the correct device. +# The memory will probably run out over time if many queues are created. +queue = get_queue(0,0) + +def test(args): + platform_id, knl, tlist_generator, params, test_fn = args + #queue = get_queue(charm.myPe(), platform_id) + result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) + return result + + +def unpickle_kernel(fname): + from pickle import load + f = open(fname, "rb") + program = load(f) + f.close() + return program + +def autotune_pickled_kernels(path, platform_id, actx_class, comm): + from os import listdir + dir_list = listdir(path) + for f in dir_list: + if f.endswith(".pickle"): + fname = path + "/" + f + print("===============================================") + print("Autotuning", fname) + knl = unpickle_kernel(fname) + knl_id = f.split(".")[0] + knl_id = knl_id.split("_")[-1] + print("Kernel ID", knl_id) + print("New kernel ID", gac.unique_program_id(knl)) + + assert knl_id == gac.unique_program_id(knl) + knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = gac.set_memory_layout(knl) + assert knl_id == gac.unique_program_id(knl) + + print(knl) + pid = gac.unique_program_id(knl) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + if not exists(hjson_file_str): + parallel_autotune(knl, platform_id, actx_class, comm) + else: + print("hjson file exists, skipping") + +def parallel_autotune(knl, platform_id, actx_class, comm): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + + 
import pyopencl.tools as cl_tools + actx = actx_class( + comm, + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))) + + #knl = gac.fix_program_parameters(knl) + #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = gac.set_memory_layout(knl) + pid = gac.unique_program_id(knl) + os.makedirs(os.getcwd() + "/hjson", exist_ok=True) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + + + assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from run_tests import run_single_param_set + + tlist_generator, pspace_generator = actx.get_generators(knl) + params_list = pspace_generator(actx.queue, knl) + + # Could make a massive list with all kernels and parameters + args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list] + + + # May help to balance workload + # Should test if shuffling matters + from random import shuffle + shuffle(args) + + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work + + pool_proxy = Chare(PoolScheduler, onPE=0) + mypool = Pool(pool_proxy) + if len(args) > 0: # Guard against empty list + results = mypool.map(test, args) + + sort_key = lambda entry: entry[0] + results.sort(key=sort_key) + + #for r in results: + # print(r) + # Workaround for pocl CUDA bug + # whereby times are imprecise + ret_index = 0 + for i, result in enumerate(results): + if result[0] > 1e-7: + ret_index = i + break + + avg_time, transformations, data = results[ret_index] + else: + transformations = {} + + od = {"transformations": transformations} + out_file = 
open(hjson_file_str, "wt+") + hjson.dump(od, out_file,default=convert) + out_file.close() + + return transformations + +""" +def main(args): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + platform_id = 0 + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from grudge.execution import diff_prg, elwise_linear_prg + knl = diff_prg(3, 1000000, 3, np.float64) + params = dgk.run_tests.gen_autotune_list(queue, knl) + + args = [[param, knl] for param in params] + + # May help to balance workload + from random import shuffle + shuffle(args) + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + pool_proxy = Chare(BalancedPoolScheduler, onPE=0) + mypool = Pool(pool_proxy) + result = mypool.map(do_work, args) + + sort_key = lambda entry: entry[0] + result.sort(key=sort_key) + + + for r in result: + print(r) +""" + +def main(args): + import mpi4py.MPI as MPI + from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac + comm = MPI.COMM_WORLD + + autotune_pickled_kernels("./pickled_programs", 0, Maac, comm) + print("DONE!") + exit() + +def charm_autotune(): + charm.start(main) + print(result) + charm.exit() + +if __name__ == "__main__": + charm.start(main) + print(result) + charm.exit() diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py b/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py 
new file mode 100644 index 000000000..1fd34128e --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning_mpi4py.py @@ -0,0 +1,308 @@ +#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm +#from charm4py.pool import PoolScheduler, Pool +#from charm4py.charm import Charm, CharmRemote +#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap +#from charm4py.sections import SectionManager +#import inspect +#import sys +import hjson +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +import os +import grudge.grudge_array_context as gac +import loopy as lp +from os.path import exists +from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test +from grudge.grudge_array_context import convert +#from grudge.execution import diff_prg, elwise_linear +import mpi4py.MPI as MPI +from mpi4py.futures import MPIPoolExecutor, MPICommExecutor +#from mpipool import MPIPool + +#from guppy import hpy +#import gc +#import linecache +#import os +#import tracemalloc +#from mem_top import mem_top +#import matplotlib.pyplot as plt + +data_dict = {} + +def display_top(snapshot, key_type='lineno', limit=10): + snapshot = snapshot.filter_traces(( + tracemalloc.Filter(False, ""), + tracemalloc.Filter(False, ""), + tracemalloc.Filter(False, ""), + )) + top_stats = snapshot.statistics(key_type) + + print("Top %s lines" % limit) + for index, stat in enumerate(top_stats[:limit], 1): + frame = stat.traceback[0] + # replace "/path/to/module/file.py" with "module/file.py" + filename = os.sep.join(frame.filename.split(os.sep)[-2:]) + print("#%s: %s:%s: %.1f KiB" + % (index, filename, frame.lineno, stat.size / 1024)) + line = linecache.getline(frame.filename, frame.lineno).strip() + d_str = filename + ":" + str(frame.lineno) + ": " + line + if d_str not in data_dict: + data_dict[d_str] = [stat.size] + else: + data_dict[d_str].append(stat.size) + + if line: + print(' %s' % line) + + 
fig = plt.figure(0) + fig.clear() + plt.ion() + plt.show() + dlist = sorted(data_dict.items(), key=lambda a: a[1][-1], reverse=True)[:10] + #print(dlist) + #exit() + for key, vals in dlist: + plt.plot(vals, label=key + " " + str(vals[-1]) + " bytes") + plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=False, ncol=1) + plt.draw() + #plt.pause(1) + plt.savefig("memory_usage.png", bbox_inches="tight") + + other = top_stats[limit:] + if other: + size = sum(stat.size for stat in other) + print("%s other: %.1f KiB" % (len(other), size / 1024)) + total = sum(stat.size for stat in top_stats) + print("Total allocated size: %.1f KiB" % (total / 1024)) + + + +def get_queue(pe_num, platform_num): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + +# Assume using platform zero +comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future +# From MPI.PoolExecutor the communicator for the tasks is not COMM_WORLD +queue = get_queue(comm.Get_rank(), 0) + +def test(args): + platform_id, knl, tlist_generator, params, test_fn = args + #comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. 
May need to change this in the future + # From MPI.PoolExecutor the communicator for the tasks is not COMM_WORLD + #queue = get_queue(comm.Get_rank(), platform_id) + result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) + #print(mem_top()) + #h = hpy() + #print(h.heap()) + #snapshot = tracemalloc.take_snapshot() + #display_top(snapshot) + #del knl + #del args + + #result = [10,10,10] + return result + +def unpickle_kernel(fname): + from pickle import load + f = open(fname, "rb") + program = load(f) + f.close() + return program + + +def autotune_pickled_kernels(path, platform_id, actx_class, comm): + from os import listdir + dir_list = listdir(path) + for f in dir_list: + if f.endswith(".pickle"): + fname = path + "/" + f + print("===============================================") + print("Autotuning", fname) + knl = unpickle_kernel(fname) + knl_id = f.split(".")[0] + knl_id = knl_id.split("_")[-1] + + #assert knl_id == gac.unique_program_id(knl) + + print("Kernel ID", knl_id) + print("Calculated Kernel ID", gac.unique_program_id(knl)) + # These should be baked into the kernel object already? 
+ #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + #knl = gac.set_memory_layout(knl) + #print("New kernel ID", gac.unique_program_id(knl)) + + assert knl_id == gac.unique_program_id(knl) + + print(knl) + #pid = gac.unique_program_id(knl) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{knl_id}.hjson" + if not exists(hjson_file_str): + parallel_autotune(knl, platform_id, actx_class, comm) + else: + print("hjson file exists, skipping") + + +def parallel_autotune(knl, platform_id, actx_class, comm): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + # Should just use get_queue + ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + + import pyopencl.tools as cl_tools + actx = actx_class( + comm, + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))) + + #knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + #knl = gac.set_memory_layout(knl) + pid = gac.unique_program_id(knl) + os.makedirs(os.getcwd() + "/hjson", exist_ok=True) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + + #assert comm.Get_size() > 1 + #assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from run_tests import run_single_param_set + + tlist_generator, pspace_generator = actx.get_generators(knl) + params_list = pspace_generator(actx.queue, knl) + + # Could make a massive list with all kernels and parameters + args = [(platform_id, knl, tlist_generator, p, generic_test,) 
for p in params_list] + + # May help to balance workload + # Should test if shuffling matters + #from random import shuffle + #shuffle(args) + + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work + + #pool_proxy = Chare(PoolScheduler, onPE=0) + + sort_key = lambda entry: entry[0] + transformations = {} + comm = MPI.COMM_WORLD + #nranks = comm.Get_size() + if len(params_list) > 0: # Guard against empty list + #executor = MPIPoolExecutor(max_workers=1) + #results = list(executor.map(test, args)) + #results.sort(key=sort_key) + #avg_time, transformations, data = results[0] + #for entry in results: + # print(entry) + #exit() + #""" + with MPICommExecutor(comm, root=0) as mypool: + if mypool is not None: + results = list(mypool.map(test, args, chunksize=1)) + results.sort(key=sort_key) + + #for r in results: + # print(r) + # Workaround for pocl CUDA bug + # whereby times are imprecise + ret_index = 0 + for i, result in enumerate(results): + if result[0] > 1e-7: + ret_index = i + break + + avg_time, transformations, data = results[ret_index] + od = {"transformations": transformations} + out_file = open(hjson_file_str, "wt+") + hjson.dump(od, out_file,default=convert) + out_file.close() + #""" + + return transformations + +""" +def main(args): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + platform_id = 0 + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # 
Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from grudge.execution import diff_prg, elwise_linear_prg + knl = diff_prg(3, 1000000, 3, np.float64) + params = dgk.run_tests.gen_autotune_list(queue, knl) + + args = [[param, knl] for param in params] + + # May help to balance workload + from random import shuffle + shuffle(args) + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + pool_proxy = Chare(BalancedPoolScheduler, onPE=0) + mypool = Pool(pool_proxy) + result = mypool.map(do_work, args) + + sort_key = lambda entry: entry[0] + result.sort(key=sort_key) + + + for r in result: + print(r) +""" + +def main(): + from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac + comm = MPI.COMM_WORLD + + #tracemalloc.start() + #gc.set_debug(gc.DEBUG_UNCOLLECTABLE) + autotune_pickled_kernels("./pickled_programs", 0, Maac, comm) + + print("DONE!") + exit() + +if __name__ == "__main__": + import sys + main() + + #pool = MPIPool() + + #if not pool.is_master(): + # pool.wait() + # sys.exit(0) + diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py b/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py new file mode 100644 index 000000000..cfdef85fe --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning_mpipool.py @@ -0,0 +1,225 @@ +#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm +#from charm4py.pool import PoolScheduler, Pool +#from charm4py.charm import Charm, CharmRemote +#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap +#from charm4py.sections import SectionManager +#import inspect +#import sys +import hjson +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +import os +import grudge.grudge_array_context as gac +import loopy as lp +from os.path import exists +from 
grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test +from grudge.grudge_array_context import convert +#from grudge.execution import diff_prg, elwise_linear +import mpi4py.MPI as MPI +from mpi4py.futures import MPIPoolExecutor, MPICommExecutor +from mpipool import MPIPool + +def get_queue(pe_num, platform_num): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + +comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. May need to change this in the future +queue = get_queue(comm.Get_rank(), 0) + + +def test(args): + #print(args) + platform_id, knl, tlist_generator, params, test_fn = args + result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) + return result + + +def unpickle_kernel(fname): + from pickle import load + f = open(fname, "rb") + program = load(f) + f.close() + return program + + +def autotune_pickled_kernels(path, platform_id, actx_class, comm): + from os import listdir + dir_list = listdir(path) + for f in dir_list: + if f.endswith(".pickle"): + fname = path + "/" + f + print("===============================================") + print("Autotuning", fname) + knl = unpickle_kernel(fname) + knl_id = f.split(".")[0] + knl_id = knl_id.split("_")[-1] + print("Kernel ID", knl_id) + print("New kernel ID", gac.unique_program_id(knl)) + + assert knl_id == gac.unique_program_id(knl) + knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = gac.set_memory_layout(knl) + assert knl_id == gac.unique_program_id(knl) + + print(knl) + pid = gac.unique_program_id(knl) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + if not exists(hjson_file_str): + + parallel_autotune(knl, platform_id, actx_class, comm) + else: + print("hjson file exists, 
skipping") + + +def parallel_autotune(knl, platform_id, actx_class, comm): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + + import pyopencl.tools as cl_tools + actx = actx_class( + comm, + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))) + + knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = gac.set_memory_layout(knl) + pid = gac.unique_program_id(knl) + os.makedirs(os.path.dirname("./hjson"), exist_ok=True) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + + #assert comm.Get_size() > 1 + #assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from run_tests import run_single_param_set + + tlist_generator, pspace_generator = actx.get_generators(knl) + params_list = pspace_generator(actx.queue, knl) + + # Could make a massive list with all kernels and parameters + args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list] + + + # May help to balance workload + # Should test if shuffling matters + from random import shuffle + shuffle(args) + + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work + + #pool_proxy = Chare(PoolScheduler, onPE=0) + + sort_key = lambda entry: entry[0] + transformations = {} + if len(args) 
> 0: # Guard against empty list + with MPIPool() as mypool: + mypool.workers_exit() + if mypool is not None: + results = list(mypool.map(test, args)) + results.sort(key=sort_key) + + #for r in results: + # print(r) + # Workaround for pocl CUDA bug + # whereby times are imprecise + ret_index = 0 + for i, result in enumerate(results): + if result[0] > 1e-7: + ret_index = i + break + + avg_time, transformations, data = results[ret_index] + + od = {"transformations": transformations} + out_file = open(hjson_file_str, "wt+") + hjson.dump(od, out_file,default=convert) + out_file.close() + + return transformations + +""" +def main(args): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + platform_id = 0 + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from grudge.execution import diff_prg, elwise_linear_prg + knl = diff_prg(3, 1000000, 3, np.float64) + params = dgk.run_tests.gen_autotune_list(queue, knl) + + args = [[param, knl] for param in params] + + # May help to balance workload + from random import shuffle + shuffle(args) + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + pool_proxy = Chare(BalancedPoolScheduler, onPE=0) + mypool = Pool(pool_proxy) + result = mypool.map(do_work, args) + + sort_key = lambda entry: entry[0] + result.sort(key=sort_key) + + + for r in result: + print(r) +""" + +def main(): + 
from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac + comm = MPI.COMM_WORLD + + autotune_pickled_kernels("./pickled_programs", 0, Maac, comm) + + print("DONE!") + exit() + +if __name__ == "__main__": + import sys + main() + + #pool = MPIPool() + + #if not pool.is_master(): + # pool.wait() + # sys.exit(0) + diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py b/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py new file mode 100644 index 000000000..0d9b2d572 --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning_schwimmbad.py @@ -0,0 +1,247 @@ +#from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm +#from charm4py.pool import PoolScheduler, Pool +#from charm4py.charm import Charm, CharmRemote +#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap +#from charm4py.sections import SectionManager +#import inspect +#import sys +import hjson +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +import os +import grudge.grudge_array_context as gac +import loopy as lp +from os.path import exists +from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test +from grudge.grudge_array_context import convert +#from grudge.execution import diff_prg, elwise_linear +import mpi4py.MPI as MPI +from schwimmbad import SerialPool, MPIPool +#from schwimmbad.mpi import MPIAsyncPool + +def get_queue(pe_num, platform_num): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + +comm = MPI.COMM_WORLD # Assume we're using COMM_WORLD. 
May need to change this in the future +queue = get_queue(comm.Get_rank(), 0) + + +def test(args): + platform_id, knl, tlist_generator, params, test_fn = args + result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) + return result + + +def unpickle_kernel(fname): + from pickle import load + f = open(fname, "rb") + program = load(f) + f.close() + return program + + +def autotune_pickled_kernels(path, platform_id, actx_class, comm): + from os import listdir + dir_list = listdir(path) + for f in dir_list: + if f.endswith(".pickle"): + fname = path + "/" + f + print("===============================================") + print("Autotuning", fname) + knl = unpickle_kernel(fname) + knl_id = f.split(".")[0] + knl_id = knl_id.split("_")[-1] + print("Kernel ID", knl_id) + print("New kernel ID", gac.unique_program_id(knl)) + + assert knl_id == gac.unique_program_id(knl) + knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = gac.set_memory_layout(knl) + assert knl_id == gac.unique_program_id(knl) + + print(knl) + pid = gac.unique_program_id(knl) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + if not exists(hjson_file_str): + parallel_autotune(knl, platform_id, actx_class, comm) + else: + print("hjson file exists, skipping") + + +def parallel_autotune(knl, platform_id, actx_class, comm): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[comm.Get_rank() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + + import pyopencl.tools as cl_tools + actx = actx_class( + comm, + queue, + allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))) + + knl = lp.set_options(knl, lp.Options(no_numpy=True, return_dict=True)) + knl = 
gac.set_memory_layout(knl) + pid = gac.unique_program_id(knl) + os.makedirs(os.path.dirname("./hjson"), exist_ok=True) + hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson" + + #assert comm.Get_size() > 1 + #assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + #assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from run_tests import run_single_param_set + + tlist_generator, pspace_generator = actx.get_generators(knl) + params_list = pspace_generator(actx.queue, knl) + + # Could make a massive list with all kernels and parameters + args = [(platform_id, knl, tlist_generator, p, generic_test,) for p in params_list] + + + # May help to balance workload + # Should test if shuffling matters + from random import shuffle + shuffle(args) + + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + #pool_proxy = Chare(BalancedPoolScheduler, onPE=0) # Need to use own charm++ branch to make work + + #pool_proxy = Chare(PoolScheduler, onPE=0) + #mypool = MPIAsyncPool() + mypool = MPIPool()#Pool(pool_proxy) + #mypool = SerialPool() + if isinstance(mypool, MPIPool) and not mypool.is_master(): + mypool.wait() + sys.exit(0) + + sort_key = lambda entry: entry[0] + if len(args) > 0: # Guard against empty list + results = list(mypool.map(test, args)) + mypool.close() + results.sort(key=sort_key) + + #for r in results: + # print(r) + # Workaround for pocl CUDA bug + # whereby times are imprecise + ret_index = 0 + for i, result in enumerate(results): + if result[0] > 1e-7: + ret_index = i + break + + avg_time, transformations, data = results[ret_index] + else: + transformations = {} + mypool.close() + + od = {"transformations": transformations} + out_file = open(hjson_file_str, "wt+") + hjson.dump(od, 
out_file,default=convert) + out_file.close() + + return transformations + +""" +def main(args): + + # Create queue, assume all GPUs on the machine are the same + platforms = cl.get_platforms() + platform_id = 0 + gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU) + n_gpus = len(gpu_devices) + ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]]) + profiling = cl.command_queue_properties.PROFILING_ENABLE + queue = cl.CommandQueue(ctx, properties=profiling) + + assert charm.numPes() > 1 + #assert charm.numPes() - 1 <= charm.numHosts()*len(gpu_devices) + assert charm.numPes() <= charm.numHosts()*(len(gpu_devices) + 1) + # Check that it can assign one PE to each GPU + # The first PE is used for scheduling + # Not certain how this will work with multiple nodes + + from grudge.execution import diff_prg, elwise_linear_prg + knl = diff_prg(3, 1000000, 3, np.float64) + params = dgk.run_tests.gen_autotune_list(queue, knl) + + args = [[param, knl] for param in params] + + # May help to balance workload + from random import shuffle + shuffle(args) + + #a = Array(AutotuneTask, dims=(len(args)), args=args[0]) + #a.get_queue() + + #result = charm.pool.map(do_work, args) + + pool_proxy = Chare(BalancedPoolScheduler, onPE=0) + mypool = Pool(pool_proxy) + result = mypool.map(do_work, args) + + sort_key = lambda entry: entry[0] + result.sort(key=sort_key) + + + for r in result: + print(r) +""" + +def main(): + from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac + comm = MPI.COMM_WORLD + + autotune_pickled_kernels("./pickled_programs", 0, Maac, comm) + + print("DONE!") + exit() + +""" +def worker(task): + a, b = task + return a**2 + b**2 + +def main(args): + # Here we generate some fake data + import random + a = [random.random() for _ in range(10000)] + b = [random.random() for _ in range(10000)] + + tasks = list(zip(a, b)) + results = pool.map(worker, tasks) + pool.close() + + print(results[:8]) +""" + +if __name__ == 
"__main__": + import sys + main() + + #pool = MPIPool() + + #if not pool.is_master(): + # pool.wait() + # sys.exit(0) + diff --git a/grudge/loopy_dg_kernels/parallel_autotuning_v2.py b/grudge/loopy_dg_kernels/parallel_autotuning_v2.py new file mode 100644 index 000000000..2dff8c7d4 --- /dev/null +++ b/grudge/loopy_dg_kernels/parallel_autotuning_v2.py @@ -0,0 +1,253 @@ +from charm4py import entry_method, chare, Chare, Array, Reducer, Future, charm +from charm4py.pool import PoolScheduler, Pool +from charm4py.charm import Charm, CharmRemote +#from charm4py.chare import GROUP, MAINCHARE, ARRAY, CHARM_TYPES, Mainchare, Group, ArrayMap +#from charm4py.sections import SectionManager +#import inspect +#import sys +import hjson +import pyopencl as cl +import numpy as np +import grudge.loopy_dg_kernels as dgk +import os +import grudge.grudge_array_context as gac +import loopy as lp +from os.path import exists +from grudge.loopy_dg_kernels.run_tests import run_single_param_set, generic_test +from grudge.grudge_array_context import convert +#from grudge.execution import diff_prg, elwise_linear + +# Makes one PE inactive on each host so the number of workers is the same on all hosts as +# opposed to the basic PoolScheduler which has one fewer worker on the host with PE 0. +# This can be useful for running tasks on a GPU cluster for example. 
+class BalancedPoolScheduler(PoolScheduler): + + def __init__(self): + super().__init__() + n_pes = charm.numPes() + n_hosts = charm.numHosts() + pes_per_host = n_pes // n_hosts + + assert n_pes % n_hosts == 0 # Enforce constant number of pes per host + assert pes_per_host > 1 # We're letting one pe on each host be unused + + self.idle_workers = set([i for i in range(n_pes) if not i % pes_per_host == 0 ]) + self.num_workers = len(self.idle_workers) + +# Use all PEs including PE 0 +class AllPEsPoolScheduler(PoolScheduler): + + def __init__(self): + super().__init__() + n_pes = charm.numPes() + n_hosts = charm.numHosts() + + self.idle_workers = set(range(n_pes)) + self.num_workers = len(self.idle_workers) + + +def get_queue(pe_num, platform_num): + platforms = cl.get_platforms() + gpu_devices = platforms[platform_num].get_devices(device_type=cl.device_type.GPU) + ctx = cl.Context(devices=[gpu_devices[pe_num % len(gpu_devices)]]) + queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + return queue + + +def do_work(args): + params = args[0] + knl = args[1] + queue = get_queue(charm.myPe()) + print("PE: ", charm.myPe()) + avg_time, transform_list = dgk.run_tests.apply_transformations_and_run_test(queue, knl, dgk.run_tests.generic_test, params) + return avg_time, params + +def test(args): + platform_id, knl, tlist_generator, params, test_fn = args + queue = get_queue(charm.myPe(), platform_id) + result = run_single_param_set(queue, knl, tlist_generator, params, test_fn) + return result + + + +def unpickle_kernel(fname): + from pickle import load + f = open(fname, "rb") + program = load(f) + f.close() + return program + +def autotune_pickled_kernels(path, platform_id, actx_class, comm): + from os import listdir + dir_list = listdir(path) + for f in dir_list: + if f.endswith(".pickle"): + fname = path + "/" + f + print("===============================================") + print("Autotuning", fname) + knl = unpickle_kernel(fname) + knl_id = 
def parallel_autotune(knl, platform_id, actx_class, comm):
    """Time every candidate parameter set of *knl* on a charm pool.

    The transformation list of the first result (sorted by average time)
    whose time exceeds ``1e-7`` is written to
    ``hjson/<entrypoint name>_<program id>.hjson`` and returned.

    :arg knl: the loopy program to tune.
    :arg platform_id: index into ``cl.get_platforms()`` used to find GPUs.
    :arg actx_class: array context class; it is instantiated as
        ``actx_class(comm, queue, allocator=...)`` and must provide
        ``get_generators(knl)``.
    :arg comm: communicator forwarded to *actx_class*.
    :returns: the selected transformations, or ``{}`` if the parameter
        space is empty.
    """
    from operator import itemgetter
    from random import shuffle

    import pyopencl.tools as cl_tools

    # Create queue, assume all GPUs on the machine are the same
    platforms = cl.get_platforms()
    gpu_devices = platforms[platform_id].get_devices(device_type=cl.device_type.GPU)
    n_gpus = len(gpu_devices)
    ctx = cl.Context(devices=[gpu_devices[charm.myPe() % n_gpus]])
    profiling = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx, properties=profiling)

    actx = actx_class(
        comm,
        queue,
        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    knl = gac.set_memory_layout(knl)
    pid = gac.unique_program_id(knl)
    os.makedirs(os.getcwd() + "/hjson", exist_ok=True)
    hjson_file_str = f"hjson/{knl.default_entrypoint.name}_{pid}.hjson"

    # The first PE is used for scheduling, so at least one more is needed
    # to do the work.
    # Not certain how this will work with multiple nodes
    assert charm.numPes() > 1

    # NOTE(review): imported but not referenced in this function; kept
    # because the import itself may be relied upon -- confirm.
    from run_tests import run_single_param_set  # noqa: F401

    tlist_generator, pspace_generator = actx.get_generators(knl)
    params_list = pspace_generator(actx.queue, knl)

    # Could make a massive list with all kernels and parameters
    args = [(platform_id, knl, tlist_generator, p, generic_test,)
            for p in params_list]

    # May help to balance workload
    # Should test if shuffling matters
    shuffle(args)

    pool_proxy = Chare(PoolScheduler, onPE=0)
    mypool = Pool(pool_proxy)

    if args:  # Guard against empty list
        results = mypool.map(test, args)
        results.sort(key=itemgetter(0))

        # Workaround for pocl CUDA bug whereby times are imprecise:
        # take the fastest result above the threshold and fall back to the
        # overall fastest one when none exceeds it.
        ret_index = next(
            (i for i, result in enumerate(results) if result[0] > 1e-7), 0)
        _avg_time, transformations, _data = results[ret_index]
    else:
        transformations = {}

    od = {"transformations": transformations}
    # The context manager closes the file even if serialization fails.
    with open(hjson_file_str, "wt+") as out_file:
        hjson.dump(od, out_file, default=convert)

    return transformations
def main(args):
    """Autotune every pickled kernel found in ``./pickled_programs``.

    Used as the entry point handed to ``charm.start``.

    :arg args: argument passed by the caller of the entry point; unused.
    """
    import mpi4py.MPI as MPI
    from mirgecom.array_context import MirgecomAutotuningArrayContext as Maac
    comm = MPI.COMM_WORLD

    autotune_pickled_kernels("./pickled_programs", 0, Maac, comm)
    print("DONE!")
    # NOTE(review): this is the interpreter's exit(), not charm.exit();
    # confirm that it shuts down the other PEs as intended.
    exit()


def charm_autotune():
    """Start the charm runtime with :func:`main` as its entry point."""
    charm.start(main)
    # No ``result`` is bound in this scope, so there is nothing to print.
    charm.exit()


if __name__ == "__main__":
    charm_autotune()
inner_tag: "l.1"}], + #["split_iname", ["idof", 20], {outer_tag: "g.1"}], + #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["resample_mat", "idof,j"], {temporary_name: "matf", default_tag: "l.auto"}], + ], + 4:[ + # Move this to array context? + #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + #["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + + ], + 5:[ + ["split_iname", ["iel", 192], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ], + 6:[ + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], +
["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 42], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 14], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 14], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 8], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + # Not optimized, just copied from 32 bit version + FP64: { + 2:[ + # Format: [Transformation, args, kwargs] + ["split_iname", ["iel", 128], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", 
inner_tag: "l.1"}], + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + ["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 3:[ + + ["split_iname", ["iel", 96], {outer_tag: "g.0", slabs:[0,1]}], + # For tests uncomment this + #["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 96], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,0]}], + ["split_iname", ["idof", 10], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["idof", 20], {outer_tag: "g.1"}], + #["split_iname", ["idof_inner", 10], {outer_tag: "ilp", inner_tag: "l.1"}], + # For tests comment this + # Would need to specify shared memory and the location for this prefetch. It probably can't help + # anyway + #["add_prefetch", ["ary", "j"], {temporary_name: "aryf", default_tag: "l.auto"}], + # Maybe can stop random accesses from evicting matrix from cache by putting it in shared memory + ], + 4:[ + # Move this to array context? 
+ #["tag_array_axes", ["mat", "sep,c,c"]], + #["tag_array_axes", ["result", "sep,f,f"]], + #["tag_array_axes", ["vec", "f,f"]], + + ["split_iname", ["iel", 32], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 32], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + # See if these pass the tests + #["split_iname", ["iel", 12], {outer_tag: "g.0", slabs:[0,1]}], + #["split_iname", ["iel_inner", 4], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + + #["split_iname", ["idof", 35], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["idof", 35], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 7], {outer_tag: "ilp", inner_tag: "l.1"}], + + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j"], {temporary_name: "matfp", default_tag: "unr"}], + ], + 5:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 56], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 8], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 56], {outer_tag: "g.1", inner_tag: "l.1"}], + #["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + + ["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "mat1fp", default_tag: "unr"}], + ] + 6:[ + ["split_iname", ["iel", 64], {outer_tag: "g.0", slabs:[0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs:[0,1]}], + ["split_iname", ["idof", 84], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + ["split_iname", ["j", 12], {outer_tag: "for", inner_tag: 
"for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ], + 7:[ + ["split_iname", ["iel", 48], {outer_tag: "g.0", slabs: [0,1]}], + ["split_iname", ["iel_inner", 16], {outer_tag: "ilp", inner_tag: "l.0", slabs: [0,1]}], + ["split_iname", ["idof", 120], {outer_tag: "g.1"}], + ["split_iname", ["idof_inner", 12], {outer_tag: "ilp", inner_tag: "l.1"}], + + #["split_iname", ["idof", 120], {outer_tag: "g.1", inner_tag: "l.1"}], + ["split_iname", ["j", 8], {outer_tag: "for", inner_tag: "for"}], + ["add_prefetch", ["vec", "j_outer,j_inner,iel_inner_outer,iel_inner_inner"], {temporary_name: "vecf", default_tag: "l.auto"}], + #["add_prefetch", ["mat", "j_inner"], {temporary_name: "matfp", default_tag: "unr"}], + + #["add_prefetch", ["mat2", "j_inner"], {temporary_name: "mat2fp", default_tag: "unr"}], + #["add_prefetch", ["mat3", "j_inner"], {temporary_name: "mat3fp", default_tag: "unr"}], + #["rename_iname", ["mat3_dim_1", "mat1_dim_1"], {existing_ok: true}], + #["rename_iname", ["mat2_dim_1", "mat1_dim_1"], {existing_ok: true}], + ] + } + } + 1d7cab16-19bd-4474-95f2-44ed1c0e60df: {} +} diff --git a/grudge/loopy_dg_kernels/roofline_plotting.py b/grudge/loopy_dg_kernels/roofline_plotting.py new file mode 100644 index 000000000..f17843cd1 --- /dev/null +++ b/grudge/loopy_dg_kernels/roofline_plotting.py @@ -0,0 +1,69 @@ +import matplotlib.pyplot as plt +import numpy as np + +max_flops_unboosted = 12288 # GFLOP/s +max_flops_boosted = 13444.5 # Empirical roofline toolkit + +max_g_bandwidth_warburton = 540 # GB/s +max_g_bandwidth_ert = 561.4 
max_l1_bandwidth = 2610.5

# x axis of the roofline curves.
flops_per_byte_accessed = np.arange(0, 101)
max_flops_unboosted_array = max_flops_unboosted * \
        np.ones_like(flops_per_byte_accessed)

# Each roofline is the lesser of the bandwidth-limited rate and the
# compute-limited rate.
max_flops_g_unboosted_data = np.minimum(flops_per_byte_accessed
        * max_g_bandwidth_warburton, max_flops_unboosted_array)
max_flops_l1_unboosted_data = np.minimum(flops_per_byte_accessed
        * max_l1_bandwidth, max_flops_unboosted_array)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.loglog(flops_per_byte_accessed, max_flops_g_unboosted_data,
        label="Device memory roofline")
ax.loglog(flops_per_byte_accessed, max_flops_l1_unboosted_data,
        label="L1 cache/Local memory roofline")

# The fifth entry is 84 in all three models (it used to read 85 here only).
theoretical_x_1 = 3*2*np.array([10, 20, 35, 56, 84, 120]) \
        / (4 + 12)  # Assumes one read and three stores
theoretical_x_4 = 3*2*np.array([10, 20, 35, 56, 84, 120]) \
        / (4 + 12 + 12)  # Assumes four reads and three stores
theoretical_x_7 = 3*2*np.array([10, 20, 35, 56, 84, 120]) \
        / (4 + 2*(12+12))  # Assumes seven reads and three stores
theoretical_y_1 = np.minimum(theoretical_x_1
        * max_g_bandwidth_warburton, max_flops_unboosted)
theoretical_y_4 = np.minimum(theoretical_x_4
        * max_g_bandwidth_warburton, max_flops_unboosted)
theoretical_y_7 = np.minimum(theoretical_x_7
        * max_g_bandwidth_warburton, max_flops_unboosted)
empirical_x = theoretical_x_4.copy()
empirical_y = [2026.9636053441898, 4049.8734098551745, 7085.0042493541905,
        8143.440577930807, 9010.054141132498, 10126.59788574097]
print(theoretical_x_1)
print(theoretical_y_1)
print(theoretical_x_4)
print(theoretical_y_4)

pn_labels = ["2", "3", "4", "5", "6", "7"]

plt.title("Grudge elementwise differentiation kernel: FP32")
ax.loglog(theoretical_x_1, theoretical_y_1, "sy",
        label="4 device memory accesses model (3 writes, 1 read)", markersize=8)
ax.loglog(theoretical_x_4, theoretical_y_4, "ob",
        label="7 device memory accesses model, (3 writes, 4 reads)")

# The measured rates are plotted twice, once per assumed access count, and
# every point is labelled with its entry of pn_labels.
ax.loglog(theoretical_x_1, empirical_y, ".r",
        label="Experimental results assuming 4 accesses")
for label, x_pos, y_pos in zip(pn_labels, theoretical_x_1, empirical_y):
    # annotate() takes the point as ``xy``; ``x`` is not a valid keyword.
    ax.annotate(label, xy=(x_pos, y_pos))
ax.loglog(theoretical_x_4, empirical_y, ".g",
        label="Experimental results assuming 7 accesses")
for label, x_pos, y_pos in zip(pn_labels, theoretical_x_4, empirical_y):
    ax.annotate(label, xy=(x_pos, y_pos))

plt.ylabel("GFLOP/s")
# The x values are flops divided by bytes, not the reverse.
plt.xlabel("Flop per byte accessed")
plt.legend()
plt.show()
def testBandwidth(fp_format=np.float32, nruns=100):
    """Print the bandwidth of a loopy copy kernel for a range of sizes.

    For each array length the kernel is launched twice untimed and *nruns*
    times timed.  The number of bytes moved and the bandwidth in GB/s are
    printed, followed by ``INCORRECT COPY`` if the output differs from the
    input.

    :arg fp_format: numpy dtype of the copied entries.
    :arg nruns: number of timed kernel launches per array length.
    """
    from pyopencl.array import sum as clsum
    from pyopencl.tools import ImmediateAllocator, MemoryPool

    platform = cl.get_platforms()
    my_gpu_devices = platform[0].get_devices(device_type=cl.device_type.GPU)
    # NOTE(review): the context is chosen interactively while the kernel
    # targets the first GPU of platform 0; presumably these are meant to be
    # the same device -- confirm.
    ctx = cl.create_some_context(interactive=True)
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    mem_pool = MemoryPool(ImmediateAllocator(queue))

    knl = lp.make_copy_kernel("c,c", old_dim_tags="c,c")
    knl = lp.add_dtypes(knl, {"input": fp_format, "output": fp_format})
    knl = knl.copy(target=lp.PyOpenCLTarget(my_gpu_devices[0]))
    n0 = 2
    knl = lp.split_iname(knl, "i1", 256, inner_tag="l.0", outer_tag="g.0", slabs=(0,1))

    # Ask the dtype for its size instead of assuming 4 or 8 bytes.
    fp_bytes = np.dtype(fp_format).itemsize

    # Array lengths: every power of two up to 2**28 plus a sequence growing
    # by a factor of about 1.5, with duplicates removed.
    max_floats = 2**28
    lengths = {2**i for i in range(29)}
    float_count = 1
    while float_count <= max_floats:
        lengths.add(float_count)
        float_count = int(np.ceil(float_count*1.5))
    len_list = sorted(lengths)

    print(len_list)

    for n in len_list:
        kern = lp.fix_parameters(knl, n0=n0, n1=n)
        inpt = cl.clrandom.rand(queue, (n0, n), dtype=fp_format)
        outpt = cl.array.Array(queue, (n0, n), dtype=fp_format, allocator=mem_pool)

        # Untimed warm-up launches.
        for _ in range(2):
            kern(queue, input=inpt, output=outpt)

        events = []
        for _ in range(nruns):
            evt, _ = kern(queue, input=inpt, output=outpt)
            events.append(evt)

        cl.wait_for_events(events)
        # Average the profiled time per launch and scale it by 1e9.
        dt = sum(evt.profile.end - evt.profile.start for evt in events)
        dt = dt / nruns / 1e9

        # Every entry is read once and written once.
        nbytes_transferred = 2*fp_bytes*n*n0
        bandwidth = nbytes_transferred / dt / 1e9
        print("{} {}".format(nbytes_transferred, bandwidth))

        # Sum absolute differences so that errors of opposite sign cannot
        # cancel each other out.
        diff = inpt - outpt
        if clsum(abs(diff)).get() != 0:
            print("INCORRECT COPY")
+ kern = lp.set_options(kern, "write_code") # Output code before editing it + # Print the Code + kern = kern.copy(target=lp.PyOpenCLTarget(my_gpu_devices[0])) + code = lp.generate_code_v2(kern).device_code() + prog = cl.Program(ctx, code) + prog = prog.build() + ptx = prog.get_info(cl.program_info.BINARIES)[0]#.decode( + #errors="ignore") #Breaks pocl + from bs4 import UnicodeDammit + dammit = UnicodeDammit(ptx) + #print(dammit.unicode_markup) + f = open("ptx.ptx", "w") + f.write(dammit.unicode_markup) + f.close() + + from pyopencl.tools import ImmediateAllocator, MemoryPool + allocator = ImmediateAllocator(queue) + mem_pool = MemoryPool(allocator) + + X_dev = cl.array.Array(queue, (n_elem, n_in), dtype=fp_format, order="F", allocator=mem_pool) + cl.clrandom.fill_rand(X_dev, queue=queue) + B_dev = cl.array.Array(queue, (n_elem, n_out), dtype=fp_format, allocator=mem_pool,order="F") + A_dev = cl.clrandom.rand(queue, (n_out, n_in), dtype=fp_format) + + if warmup: + for i in range(2): + kern(queue, result=B_dev, mat=A_dev, vec=X_dev) + queue.finish() + + sum_time = 0.0 + events = [] + for i in range(nruns): + evt, _ = kern(queue, result=B_dev, mat=A_dev, vec=X_dev) + events.append(evt) + + cl.wait_for_events(events) + for evt in events: + sum_time += evt.profile.end - evt.profile.start + sum_time = sum_time / 1e9 + #queue.finish() + + avg_time = sum_time / nruns + + return (B_dev, A_dev, X_dev), avg_time + +# Maybe the queue could also be a cuda stream? 
def generic_test(queue, kern, backend="OPENCL", nruns=10, warmup=True):
    """Run *kern* on freshly allocated arguments and return its average time.

    One device array is allocated per kernel argument, using the shape,
    dtype and strides recorded on the argument.  Arrays that are not
    outputs are filled with random data.

    :arg queue: command queue to run on; the events it produces must carry
        profiling information.
    :arg kern: the loopy program to execute.
    :arg backend: ``"OPENCL"`` or ``"CUDA"``; only OpenCL is implemented.
    :arg nruns: number of timed launches.
    :arg warmup: if true, launch the kernel twice before timing.
    :returns: a tuple ``(arg_dict, avg_time)`` of the argument-name-to-array
        mapping and the average time per launch (summed profiled time
        divided by ``1e9`` and by *nruns*).
    :raises NotImplementedError: for the CUDA backend and for arguments
        tagged ``IsSepVecDOFArray`` or ``IsSepVecOpArray``.
    :raises TypeError: for arguments that are not :class:`loopy.ArrayArg`.
    :raises ValueError: if an ``indices`` argument exists without an
        ``ary`` argument to index into.
    """
    kern = lp.set_options(kern, "no_numpy")
    kern = lp.set_options(kern, "return_dict")

    if backend == "CUDA":
        raise NotImplementedError("CUDA not supported")

    mem_pool = MemoryPool(ImmediateAllocator(queue))
    knl_args = kern.default_entrypoint.args
    arg_dict = {}

    # Could probably move this to a separate function and memoize it
    for arg in knl_args:
        # Scale each axis stride by the item size to obtain byte strides.
        fp_bytes = arg.dtype.numpy_dtype.itemsize
        strides = [fp_bytes*entry.stride for entry in arg.dim_tags]

        if IsSepVecDOFArray() in arg.tags:
            raise NotImplementedError(
                f"VERIFY IF STRIDES IS CORRECT FOR SEPVECDOFARRAY: "
                f"'{arg.name}' has strides {strides}")
        if IsSepVecOpArray() in arg.tags:
            raise NotImplementedError(
                f"VERIFY IF STRIDES IS CORRECT FOR SEPVECOPARRAY: "
                f"'{arg.name}' has strides {strides}")
        if not isinstance(arg, lp.ArrayArg):
            raise TypeError(
                f"cannot allocate test data for argument '{arg.name}' "
                f"of type {type(arg).__name__}")

        print(f"Giving '{arg.name}' strides {strides}")
        array = cl.array.Array(queue, arg.shape, arg.dtype, strides=strides,
                               allocator=mem_pool)

        if not arg.is_output:
            if arg.name == "indices":
                # Handle generating random indices for resampling kernels:
                # draw them from [0, first axis length of "ary").
                # This functionality should probably be moved to a separate
                # test function.
                data_arg_shape = None
                for data_arg in knl_args:
                    if data_arg.name == "ary":
                        data_arg_shape = data_arg.shape[0]
                if data_arg_shape is None:
                    raise ValueError(
                        "'indices' argument found but no 'ary' argument")
                cl.clrandom.fill_rand(array, queue=queue, a=0, b=data_arg_shape)
            else:
                cl.clrandom.fill_rand(array, queue=queue)

        arg_dict[arg.name] = array

    if warmup:
        for _ in range(2):
            kern(queue, **arg_dict)
        queue.finish()

    events = []
    for _ in range(nruns):
        evt, _out = kern(queue, **arg_dict)
        events.append(evt)

    cl.wait_for_events(events)
    sum_time = sum(evt.profile.end - evt.profile.start for evt in events)
    sum_time = sum_time / 1e9

    avg_time = sum_time / nruns

    return arg_dict, avg_time
+ for arg in knl.default_entrypoint.args: + print(arg.name) + print(arg.shape) + print(type(arg.dtype)) + entries = np.prod((arg.shape)) + fp_bytes = arg.dtype.dtype.itemsize + nbytes += fp_bytes * entries + bw = nbytes / avg_time / 1e9 + + # Seems lp.gather_access_footprint_bytes breaks + #footprint = lp.gather_access_footprint_bytes(knl) + #footprint_bytes = 0 + #for val in footprint.values(): + # footprint_bytes += val.eval_with_dict({}) + #footprint_bw = footprint_bytes / avg_time / 1e9 + #print(f"Time: {avg_time}, Bytes: {nbytes}, Bandwidth: {bw} GB/s Footprint BW: {footprint_bw} GB/s") + + print(f"Time: {avg_time}, Bytes: {nbytes}, Bandwidth: {bw} GB/s") + return bw + + +def analyze_FLOPS(knl, avg_time, max_gflops=None): + + op_map = lp.get_op_map(knl, count_within_subscripts=False, subgroup_size=1) + #print(op_map) + map_flops = 0 + for val in op_map.values(): + map_flops += val.eval_with_dict({}) + gflop_rate = map_flops / avg_time / 1e9 + + """ + n_mat = 1 + nfaces = 1 + for arg in knl.default_entrypoint.args: + if IsDOFArray() in arg.tags: + n_elem, n_out = arg.shape + fp_bytes = arg.dtype.dtype.itemsize + elif IsSepVecOpArray() in arg.tags or IsVecOpArray() in arg.tags: + n_mat, n_out, n_in = arg.shape + elif IsOpArray() in arg.tags: + n_out, n_in = arg.shape + elif IsFaceDOFArray() in arg.tags: + nfaces, n_elem, n_in = arg.shape + + flops = nfaces*n_mat*2*(n_out * n_in * n_elem) + """ + gflop_rate = (map_flops / avg_time) * 1e-9 + print("GFLOP/s: " + str(gflop_rate)) + + #print("Map GFLOP/s: " + str(map_gflop_rate)) + #print(flops) + #print(map_flops) + + frac_peak_gflops = None + if max_gflops is not None: + print("Peak GFLOP/s: " + str(max_gflops)) + frac_peak_gflops = gflop_rate / max_gflops + print("Percent peak: " + str(100*(frac_peak_gflops))) + + print() + + # Calculate bandwidth + # Assumes each element only read once + #ideal_total_bytes_transferred = fp_bytes*(3*(n_out * n_elem) + (n_in * n_elem) + # + 3*(n_out * n_in)) + #GBps = 
def _print_relative_errors(expected, actual):
    """Print the nonzero fraction of the first error and all error norms.

    *expected* and *actual* are equally long sequences of host arrays.
    """
    err_mat = (expected[0] - actual[0]) / np.linalg.norm(expected[0])
    # The number of result entries comes from the error matrix itself.
    print("Fraction Nonzero: " + str(np.count_nonzero(err_mat)/err_mat.size))
    for num, (exp, act) in enumerate(zip(expected, actual), start=1):
        rel_err = np.linalg.norm(exp - act) / np.linalg.norm(exp)
        print(f"Norm{num}: " + str(rel_err))


def verifyResult(B_dev1, B_dev2, B_dev3, A_dev1, A_dev2, A_dev3, X_dev):
    """Print how far each ``B_devN`` is from ``A_devN @ X_dev``.

    Every argument must provide ``get()`` returning a host array.  Printed
    are the fraction of nonzero entries in the first error matrix and the
    relative error norm of each of the three products.
    """
    X_host = X_dev.get()
    # Print arrays in full (no summarization) from here on.
    np.set_printoptions(threshold=sys.maxsize)
    expected = [A_dev.get() @ X_host for A_dev in (A_dev1, A_dev2, A_dev3)]
    actual = [B_dev.get() for B_dev in (B_dev1, B_dev2, B_dev3)]
    _print_relative_errors(expected, actual)


def verifyResultFortran(B_dev1, B_dev2, B_dev3, A_dev1, A_dev2, A_dev3, X_dev):
    """Variant of :func:`verifyResult` for transposed vectors and results.

    *X_dev* holds the transpose of the input and each ``B_devN`` is
    compared against the transpose of ``A_devN @ X``.
    """
    X_host = X_dev.get().T
    # Print arrays in full (no summarization) from here on.
    np.set_printoptions(threshold=sys.maxsize)
    expected = [(A_dev.get() @ X_host).T for A_dev in (A_dev1, A_dev2, A_dev3)]
    actual = [B_dev.get() for B_dev in (B_dev1, B_dev2, B_dev3)]
    _print_relative_errors(expected, actual)
device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None): + + kio, kii, iio, iii, ji = params + + # Transform and run + knl = gac.set_memory_layout(knl) + if applicator is not None: + trans_list = tgenerator(params) + else: + # Should probably read in eligible transformations from a file instead of using if-statements + trans_list = [] + if "diff" in knl.default_entrypoint.name: + trans_list.append(["tag_inames", ["imatrix: ilp"]]) + + trans_list.append(["split_iname", ["iel", kio], {"outer_tag": "g.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["iel_inner", kii], + {"outer_tag": "ilp", "inner_tag":"l.0", "slabs":(0,1)}]) + trans_list.append(["split_iname", ["idof", iio], {"outer_tag": "g.1", "slabs":(0,0)}]) + trans_list.append(["split_iname", ["idof_inner", iii], + {"outer_tag": "ilp", "inner_tag":"l.1", "slabs":(0,1)}]) + + if knl.default_entrypoint.name == "face_mass": + pass + #trans_list.append(["add_prefetch", ["vec", "f,j,iel_inner_outer,iel_inner_inner"], + # {"temporary_name":"vecf", "default_tag":"l.auto"}]) + #trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2"]]) + elif knl.default_entrypoint.name == "nodes": + trans_list.append(["add_prefetch", ["nodes", "j,iel_inner_outer,iel_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "f,f"]]) + elif "resample_by_mat" in knl.default_entrypoint.name: + # Indirection may prevent prefetching + pass + else: + trans_list.append(["add_prefetch", ["vec", "j,iel_inner_outer,iel_inner_inner"], + {"temporary_name":"vecf", "default_tag":"l.auto"}]) + trans_list.append(["tag_array_axes", ["vecf", "f,f"]]) + + trans_list.append(["split_iname", ["j", ji], {"outer_tag":"for", "inner_tag":"for"}]) + trans_list.append(["add_inames_for_unused_hw_axes"]) + + knl = apply_transformation_list(knl, trans_list) + + + #print(knl.default_entrypoint.name) + #print(trans_list) + + # Execute and analyze the results + 
def run_single_param_set(queue, knl_base, tlist_generator, params, test_fn, max_gflops=None, device_memory_bandwidth=None):
    """Apply one parameter set to *knl_base*, time it and analyse the result.

    :arg tlist_generator: callable ``tlist_generator(params, knl=...)``
        returning the transformation list for *params*.
    :arg test_fn: callable ``test_fn(queue, knl)`` returning
        ``(dev_arrays, avg_time)``.
    :arg max_gflops: peak GFLOP/s used for the fraction of peak, or *None*.
    :arg device_memory_bandwidth: peak bandwidth used for the fraction of
        peak, or *None*.
    :returns: ``(avg_time, trans_list, data)``.  *data* is *None* unless both
        peak values are given, in which case it is
        ``(bandwidth, gflops, frac_peak_bandwidth, frac_peak_gflops)``.

    Early termination on a cutoff is not handled here: this function has no
    cutoff parameters, and the checks copied from the old search loop used
    names (``bandwidth_cutoff``, ``gflops_cutoff``, ``choices``) that are
    undefined in this scope.
    """
    trans_list = tlist_generator(params, knl=knl_base)
    knl = apply_transformation_list(knl_base, trans_list)
    _, avg_time = test_fn(queue, knl)

    # analyze_FLOPS takes (knl, avg_time, max_gflops=...) and returns a pair;
    # the removed second call swapped the arguments and then compared the
    # returned tuple with a float.
    _, frac_peak_gflops = analyze_FLOPS(knl, avg_time, max_gflops=max_gflops)
    bw = analyze_knl_bandwidth(knl, avg_time)

    data = None
    if device_memory_bandwidth is not None and max_gflops is not None:
        frac_peak_GBps = bw / device_memory_bandwidth
        data = (frac_peak_GBps*device_memory_bandwidth,
                frac_peak_gflops*max_gflops,
                frac_peak_GBps,
                frac_peak_gflops)

    return (avg_time, trans_list, data)
def exhaustive_search_v2(queue, knl, test_fn, pspace_generator, tlist_generator, time_limit=float("inf"), max_gflops=None,
        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
    """Time every parameter set of a search space and return the fastest.

    :arg pspace_generator: callable
        ``pspace_generator(queue, knl, start_param=...)`` returning the
        parameter sets to test.
    :arg tlist_generator: forwarded to :func:`run_single_param_set`.
    :arg time_limit: seconds after which the search stops; it is checked
        after each run, so at least one parameter set is always timed.
    :returns: the ``(avg_time, trans_list, data)`` triple with the smallest
        ``avg_time``.
    :raises IndexError: if no parameter set was tested.

    *gflops_cutoff* and *bandwidth_cutoff* are accepted but currently unused.
    """
    # Should probably obtain device_memory_bandwidth from empirical tests

    # Also fixes the parameters. Maybe that should be a separate function
    knl = gac.set_memory_layout(knl)
    knl_base = knl.copy()

    params_list = pspace_generator(queue, knl, start_param=start_param)

    result_list = []
    start = time.time()

    # Iterate over parameter space coordinates.
    # If serial run this, otherwise run the parallel autotuner.
    # Should probably make a separate function for each.
    for params in params_list:
        print(f"Currently testing: {params}")
        result = run_single_param_set(
            queue, knl_base, tlist_generator, params, test_fn,
            max_gflops=max_gflops,
            device_memory_bandwidth=device_memory_bandwidth)
        avg_time, trans_list, data = result
        result_list.append((avg_time, trans_list, data))
        print(avg_time)

        if time.time() - start > time_limit:
            break

    if not result_list:
        # Same exception type the former ``sorted(...)[0]`` produced, but
        # with a message that names the actual problem.
        raise IndexError("exhaustive_search_v2: the parameter space is empty, "
                         "nothing was tested")

    # Could save the highest performing function, but often one wants to see
    # the results over the entire parameter space.
    # min() returns the first minimal entry, like a stable sort followed by [0].
    return min(result_list, key=lambda result: result[0])
def _es_transform_kernel(knl_base, kio, kii, iio, iii, ji):
    """Return a copy of *knl_base* with the loop splittings/prefetch applied."""
    knl = knl_base.copy()
    knl = lp.split_iname(knl, "iel", kio, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "iel_inner", kii,
            outer_tag="ilp", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "idof", iio, outer_tag="g.1", slabs=(0, 0))
    knl = lp.split_iname(knl, "idof_inner", iii,
            outer_tag="ilp", inner_tag="l.1", slabs=(0, 0))

    name = knl.default_entrypoint.name
    if name == "face_mass":
        knl = lp.add_prefetch(knl, "vec", "f,j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        #knl = lp.tag_array_axes(knl, "vecf", "N1,N0,N2") # Should be this but breaks
    elif name == "nodes":
        knl = lp.add_prefetch(knl, "nodes", "j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        knl = lp.tag_array_axes(knl, "vecf", "f,f")
    elif "resample_by_mat" in name:
        # Reads are scattered so prefetching is difficult
        pass
    else:
        knl = lp.add_prefetch(knl, "vec", "j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        knl = lp.tag_array_axes(knl, "vecf", "f,f")

    knl = lp.split_iname(knl, "j", ji, outer_tag="for", inner_tag="for")
    return lp.add_inames_for_unused_hw_axes(knl)


def _es_transformation_record(name, kio, kii, iio, iii, ji):
    """Return the transformation list recorded for kernel *name*.

    NOTE(review): this record is kept exactly as before and does not mirror
    ``_es_transform_kernel`` (it adds ``tag_inames`` for "diff" kernels,
    omits the face_mass prefetch and uses slabs (0,1) for ``idof_inner``).
    Confirm which variant is intended before relying on the saved list.
    """
    trans_list = []
    if "diff" in name:
        trans_list.append(["tag_inames", ["imatrix: ilp"]])
    trans_list.append(["split_iname", ["iel", kio],
        {"outer_tag": "g.0", "slabs": (0, 1)}])
    trans_list.append(["split_iname", ["iel_inner", kii],
        {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}])
    trans_list.append(["split_iname", ["idof", iio],
        {"outer_tag": "g.1", "slabs": (0, 0)}])
    trans_list.append(["split_iname", ["idof_inner", iii],
        {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}])

    if name == "face_mass":
        pass
    elif name == "nodes":
        trans_list.append(["add_prefetch",
            ["nodes", "j,iel_inner_outer,iel_inner_inner"],
            {"temporary_name": "vecf", "default_tag": "l.auto"}])
        trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
    elif "resample_by_mat" in name:
        # Indirection may prevent prefetching
        pass
    else:
        trans_list.append(["add_prefetch",
            ["vec", "j,iel_inner_outer,iel_inner_inner"],
            {"temporary_name": "vecf", "default_tag": "l.auto"}])
        trans_list.append(["tag_array_axes", ["vecf", "f,f"]])

    trans_list.append(["split_iname", ["j", ji],
        {"outer_tag": "for", "inner_tag": "for"}])
    trans_list.append(["add_inames_for_unused_hw_axes"])
    return trans_list


def _es_print_results(result_list):
    """Sort *result_list* in place and print it with a header line."""
    result_list.sort()
    print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
    for entry in result_list:
        print(entry)
    print()


def exhaustive_search(queue, knl, test_fn, time_limit=float("inf"), max_gflops=None,
        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95, start_param=None):
    """Walk the five-dimensional split-size space and time every point.

    :arg test_fn: callable ``test_fn(queue, knl)`` returning
        ``(dev_arrays, avg_time)``.
    :arg start_param: optional ``(kio, kii, iio, iii, ji)`` at which the
        option generators start.
    :returns: depends on how the search ends (kept for compatibility):
        the parameter tuple when a cutoff is reached or the time limit
        expires, otherwise the recorded transformation list of the fastest
        run.

    When both peak values are given, each measurement is also appended to
    ``output.txt``.
    """
    # Should probably obtain device_memory_bandwidth from empirical tests

    local_mem_size = queue.device.local_mem_size
    max_work_group_size = queue.device.max_work_group_size

    # Read the problem sizes off the tagged kernel arguments.
    for arg in knl.default_entrypoint.args:
        if "resample_by_mat" not in knl.default_entrypoint.name:
            if IsDOFArray() in arg.tags:
                n_elem, n_out = arg.shape
                fp_bytes = arg.dtype.dtype.itemsize
                #n_in = n_out # Not true for non-square
            elif IsSepVecOpArray() in arg.tags:
                n_mat, n_out, n_in = arg.shape
            elif IsOpArray() in arg.tags:
                n_out, n_in = arg.shape
            elif IsFaceDOFArray() in arg.tags:
                nfaces, n_elem, n_in = arg.shape
        else:
            if IsOpArray() in arg.tags:
                n_out, n_in = arg.shape
                fp_bytes = arg.dtype.dtype.itemsize

    # Also fixes the parameters
    knl = gac.set_memory_layout(knl)

    if start_param is not None:
        kio_s, kii_s, iio_s, iii_s, ji_s = start_param
    else:
        kio_s, kii_s, iio_s, iii_s, ji_s = (None, None, None, None, None)

    knl_base = knl.copy()

    avg_time_saved = float("inf")
    result_saved = None
    result_saved_list = []

    # Iterate over five search dimensions
    result_list = []
    start = time.time()
    with open("output.txt", "a") as f:
        for kii in k_inner_inner_options(start_val=kii_s):
            # This prevents shared memory from overflowing when running with
            # the face mass kernel
            if knl.default_entrypoint.name == "face_mass":
                n_in_2 = n_in * nfaces
            else:
                n_in_2 = n_in
            for kio in k_inner_outer_options(n_in_2, kii, local_mem_size,
                    fp_bytes=fp_bytes, start_val=kio_s):
                kio_s = None  # Set to None so will form the full set the next time around
                for iii in i_inner_inner_options(n_out, kii,
                        max_work_group_size=max_work_group_size, start_val=iii_s):
                    iii_s = None
                    for iio in i_inner_outer_options(n_out, iii, start_val=iio_s):
                        iio_s = None
                        for ji in j_inner_options(n_in, start_val=ji_s):
                            ji_s = None
                            choices = (kio, kii, iio, iii, ji)
                            print(choices)

                            # Transform and run
                            knl = _es_transform_kernel(knl_base, *choices)
                            trans_list = _es_transformation_record(
                                knl.default_entrypoint.name, *choices)

                            print(knl.default_entrypoint.name)
                            print(trans_list)

                            # Execute and analyze the results
                            _, avg_time = test_fn(queue, knl)

                            print(choices)
                            if device_memory_bandwidth is not None:  # noqa
                                bw = analyze_knl_bandwidth(knl, avg_time)
                                frac_peak_GBps = bw / device_memory_bandwidth
                                if frac_peak_GBps >= bandwidth_cutoff:  # noqa
                                    # Should validate result here
                                    print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
                                    return choices

                            if max_gflops is not None:
                                # analyze_FLOPS takes (knl, avg_time,
                                # max_gflops=...) and returns a pair; the old
                                # call swapped the arguments and compared the
                                # whole tuple with the cutoff.
                                _, frac_peak_gflops = analyze_FLOPS(
                                    knl, avg_time, max_gflops=max_gflops)
                                if frac_peak_gflops >= gflops_cutoff:
                                    # Should validate result here
                                    print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
                                    return choices

                            if device_memory_bandwidth is not None and max_gflops is not None:
                                data = (avg_time,
                                        frac_peak_GBps*device_memory_bandwidth,
                                        frac_peak_gflops*max_gflops,
                                        frac_peak_GBps,
                                        frac_peak_gflops,
                                        (kio, kii, iio, iii, ji))
                                result_list.append(data)
                                f.write(str(data) + "\n")

                            if avg_time < avg_time_saved:
                                avg_time_saved = avg_time
                                result_saved = choices
                                result_saved_list = trans_list

                            if time.time() - start > time_limit:
                                _es_print_results(result_list)
                                return result_saved

    _es_print_results(result_list)

    print("Suggested loop splittings")
    print(result_saved)

    return result_saved_list
def _rs_transform_kernel(knl_base, kio, kii, iio, iii, ji):
    """Return a copy of *knl_base* with the sampled transformations applied."""
    knl = knl_base.copy()
    if "diff" in knl.default_entrypoint.name:
        knl = lp.tag_inames(knl, "imatrix: ilp")
    knl = lp.split_iname(knl, "iel", kio, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "iel_inner", kii,
            outer_tag="ilp", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "idof", iio, outer_tag="g.1", slabs=(0, 0))
    knl = lp.split_iname(knl, "idof_inner", iii,
            outer_tag="ilp", inner_tag="l.1", slabs=(0, 0))

    name = knl.default_entrypoint.name
    if name == "face_mass":
        knl = lp.add_prefetch(knl, "vec", "f,j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        # Both N1,N0,N2 and N0,N1,N2 both seem to give memory errors..
        #knl = lp.tag_array_axes(knl, "vecf", "N1,N0,N2")
    elif name == "nodes":
        knl = lp.add_prefetch(knl, "nodes", "j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        knl = lp.tag_array_axes(knl, "vecf", "f,f")
    elif "resample_by_mat" in name:
        # Indirection may prevent prefetching
        pass
    else:
        knl = lp.add_prefetch(knl, "vec", "j,iel_inner_outer,iel_inner_inner",
                temporary_name="vecf", default_tag="l.auto")
        knl = lp.tag_array_axes(knl, "vecf", "f,f")

    knl = lp.split_iname(knl, "j", ji, outer_tag="for", inner_tag="for")
    return lp.add_inames_for_unused_hw_axes(knl)


def _rs_transformation_record(name, kio, kii, iio, iii, ji):
    """Return the transformation list recorded for kernel *name*.

    NOTE(review): kept exactly as before; it records slabs (0,1) for
    ``idof_inner`` although ``_rs_transform_kernel`` applies (0,0).  Confirm
    which one is intended.
    """
    trans_list = []
    if "diff" in name:
        trans_list.append(["tag_inames", ["imatrix: ilp"]])
    trans_list.append(["split_iname", ["iel", kio],
        {"outer_tag": "g.0", "slabs": (0, 1)}])
    trans_list.append(["split_iname", ["iel_inner", kii],
        {"outer_tag": "ilp", "inner_tag": "l.0", "slabs": (0, 1)}])
    trans_list.append(["split_iname", ["idof", iio],
        {"outer_tag": "g.1", "slabs": (0, 0)}])
    trans_list.append(["split_iname", ["idof_inner", iii],
        {"outer_tag": "ilp", "inner_tag": "l.1", "slabs": (0, 1)}])

    if name == "face_mass":
        trans_list.append(["add_prefetch",
            ["vec", "f,j,iel_inner_outer,iel_inner_inner"],
            {"temporary_name": "vecf", "default_tag": "l.auto"}])
        #trans_list.append(["tag_array_axes", ["vecf", "N1,N0,N2"]])
    elif name == "nodes":
        trans_list.append(["add_prefetch",
            ["nodes", "j,iel_inner_outer,iel_inner_inner"],
            {"temporary_name": "vecf", "default_tag": "l.auto"}])
        trans_list.append(["tag_array_axes", ["vecf", "f,f"]])
    elif "resample_by_mat" in name:
        # Indirection may prevent prefetching
        pass
    else:
        trans_list.append(["add_prefetch",
            ["vec", "j,iel_inner_outer,iel_inner_inner"],
            {"temporary_name": "vecf", "default_tag": "l.auto"}])
        trans_list.append(["tag_array_axes", ["vecf", "f,f"]])

    trans_list.append(["split_iname", ["j", ji],
        {"outer_tag": "for", "inner_tag": "for"}])
    trans_list.append(["add_inames_for_unused_hw_axes"])
    return trans_list


def random_search(queue, knl, test_fn, time_limit=float("inf"), max_gflops=None,
        device_memory_bandwidth=None, gflops_cutoff=0.95, bandwidth_cutoff=0.95):
    """Sample random split sizes and time each new combination.

    :arg test_fn: callable ``test_fn(queue, knl)`` returning
        ``(dev_arrays, avg_time)``.
    :arg time_limit: seconds to keep sampling.  The loop only ends on the
        time limit or a cutoff, so with the default (infinite) limit and no
        reachable cutoff it does not terminate.
    :returns: the parameter tuple ``(kio, kii, iio, iii, ji)`` when a cutoff
        is reached, otherwise the recorded transformation list of the
        fastest run (kept for compatibility).
    """
    from random import choice

    local_mem_size = queue.device.local_mem_size
    max_work_group_size = queue.device.max_work_group_size

    avg_time_saved = float("inf")
    result_saved_list = []

    # Get sizes
    for arg in knl.default_entrypoint.args:
        if "resample_by_mat" not in knl.default_entrypoint.name:
            if IsDOFArray() in arg.tags:
                n_elem, n_out = arg.shape
                fp_bytes = arg.dtype.dtype.itemsize
                #n_in = n_out
            elif IsSepVecOpArray() in arg.tags:
                n_mat, n_out, n_in = arg.shape
            elif IsOpArray() in arg.tags:
                n_out, n_in = arg.shape
            elif IsFaceDOFArray() in arg.tags:
                nfaces, n_elem, n_in = arg.shape
        else:
            if IsOpArray() in arg.tags:
                n_out, n_in = arg.shape
                fp_bytes = arg.dtype.dtype.itemsize

    # Also fixes the parameters
    knl = gac.set_memory_layout(knl)

    # Parameter tuples that were already timed (set: O(1) membership test).
    tested = set()

    k_inner_inner_opt = k_inner_inner_options()
    j_inner_opt = j_inner_options(n_in)
    knl_base = knl.copy()
    result_list = []

    start = time.time()
    while time.time() - start < time_limit:
        # Can be more intelligent by ensuring choices are not run multiple times
        # Maybe could use expressions
        kii = choice(k_inner_inner_opt)
        if knl.default_entrypoint.name == "face_mass":
            kio = choice(k_inner_outer_options(n_in*nfaces, kii, local_mem_size,
                                               fp_bytes=fp_bytes))
        else:
            kio = choice(k_inner_outer_options(n_in, kii, local_mem_size,
                                               fp_bytes=fp_bytes))
        iii = choice(i_inner_inner_options(n_out, kii,
                                           max_work_group_size=max_work_group_size))
        iio = choice(i_inner_outer_options(n_out, iii))
        ji = choice(j_inner_opt)
        choices = (kio, kii, iio, iii, ji)

        if choices in tested:
            continue

        print(choices)
        knl = _rs_transform_kernel(knl_base, *choices)
        trans_list = _rs_transformation_record(knl.default_entrypoint.name, *choices)

        _, avg_time = test_fn(queue, knl)
        tested.add(choices)

        print(choices)
        if device_memory_bandwidth is not None:  # noqa
            bw = analyze_knl_bandwidth(knl, avg_time)
            frac_peak_GBps = bw / device_memory_bandwidth
            if frac_peak_GBps >= bandwidth_cutoff:  # noqa
                # Should validate result here
                print("Performance is within tolerance of peak bandwith. Terminating search")  # noqa
                return choices

        if max_gflops is not None:
            # analyze_FLOPS takes (knl, avg_time, max_gflops=...) and returns
            # a pair; the old call swapped the arguments and compared the
            # whole tuple with the cutoff.
            _, frac_peak_gflops = analyze_FLOPS(knl, avg_time, max_gflops=max_gflops)
            if frac_peak_gflops >= gflops_cutoff:
                # Should validate result here
                print("Performance is within tolerance of peak bandwith or flop rate. Terminating search")  # noqa
                return choices

        if device_memory_bandwidth is not None and max_gflops is not None:
            result_list.append((avg_time,
                                frac_peak_GBps*device_memory_bandwidth,
                                frac_peak_gflops*max_gflops,
                                frac_peak_GBps,
                                frac_peak_gflops,
                                (kio, kii, iio, iii, ji)))

        if avg_time < avg_time_saved:
            avg_time_saved = avg_time
            result_saved_list = trans_list

    print("Time limit exceeded: returning current best result")

    result_list.sort()

    print("Avg_time, Peak_BW, Peak_GFLOPS, Frac_peak_bandwidth, Frac_peak_GFlops")
    for entry in result_list:
        print(entry)
    print()

    return result_saved_list
def convert(o):
    """Serialization fallback: turn a NumPy scalar into a Python scalar.

    Intended as the ``default=`` hook of a JSON/HJSON encoder.

    :raises TypeError: if *o* is not a NumPy scalar.
    """
    if isinstance(o, np.generic):
        return o.item()
    raise TypeError(f"Object of type {type(o).__name__} is not serializable")


def autotune_and_save(queue, search_fn, tlist_generator, pspace_generator, hjson_file_str,
        time_limit=np.inf, program=None):
    """Run *search_fn* and write the winning transformations to an HJSON file.

    :arg search_fn: callable invoked as ``search_fn(queue, program,
        generic_test, pspace_generator, tlist_generator, time_limit=...)``
        and returning ``(avg_time, transformations, data)``.
    :arg hjson_file_str: path of the file to (over)write.
    :arg program: kernel to tune.  New optional argument; when omitted the
        module-level name ``program`` is used, which is what the previous
        body referenced implicitly.
    :returns: the transformation list that was written.
    :raises NameError: if *program* is omitted and no module-level
        ``program`` exists.
    """
    import hjson

    if program is None:
        try:
            program = globals()["program"]
        except KeyError:
            raise NameError("name 'program' is not defined; "
                            "pass program=<kernel> to autotune_and_save") from None

    try:
        _, transformations, _ = search_fn(queue, program, generic_test,
                pspace_generator, tlist_generator, time_limit=time_limit)
    except cl._cl.RuntimeError as e:
        print(e)
        print("Profiling is not enabled and the PID does not match any transformation file. Turn on profiling and run again.")
        # There is nothing to save; previously execution continued and
        # failed on the unbound name ``transformations``.
        raise

    od = {"transformations": transformations}
    with open(hjson_file_str, "wt+") as out_file:
        hjson.dump(od, out_file, default=convert)
    return transformations


def get_transformation_id(device_id):
    """Look up *device_id* in ``device_mappings.hjson`` (current directory).

    :raises KeyError: if the file has no entry for *device_id*.
    """
    import hjson

    with open("device_mappings.hjson") as hjson_file:
        od = hjson.loads(hjson_file.read())
    return od[device_id]
"diff_1d_transform.hjson", + 2: "diff_2d_transform.hjson", + 3: "diff_3d_transform.hjson"} + + bandwidths = [] + from os import environ + for nreg in range(57,58):#range(1, 61): + environ['CU_JIT_MAX_REGISTERS'] = str(nreg) + for dim in range(3,4): + hjson_file = open(dim_to_file[dim]) + #for i in range(2,8): + pn = 5 + n_out = len(equidistant_nodes(pn, 3)[1]) + n_in = len(equidistant_nodes(pn, 3)[1]) + n_elem = 178746 # 2**20 + knl = diff_prg(dim, n_elem, n_out, fp_format) + #knl = gen_diff_knl_fortran2(dim, n_elem, n_out, n_in, fp_format=fp_format) + knl = set_memory_layout(knl) + knl = lp.set_options(knl, "write_code") + trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(n_out)]) + knl = apply_transformation_list(knl, trans) + #print(lp.generate_code_v2(knl).device_code()) + + dev_arrays, avg_time = generic_test(queue, knl, nruns=10, warmup=True) + #dev_arrays, avg_time = runTest(n_elem, n_in, n_out, kio, kii, iio, iii, ji) + bw = analyze_knl_bandwidth(knl, avg_time) + bandwidths.append(bw) + #analyzeResult(n_out, n_in, n_elem, 12288//2, 540, avg_time, fp_bytes=fp_bytes) + print(avg_time) + #verifyResult(*dev_arrays) + + print(knl) + for i, entry in enumerate(bandwidths): + print(f"{i}, {entry}") + #print(bandwidths) + """ + #testBandwidth() + #exit() + """ + # Test elwise linear + pn = 4 + n_out = len(equidistant_nodes(pn,3)[1]) + n_in = n_out + n_elem = 178746 + fp_format = np.float64 + fp_string = "FP64" if fp_format == np.float64 else "FP32" + knl = elwise_linear_prg(n_elem, n_out, fp_format) + #knl = gen_elwise_linear_knl(n_elem, n_in, n_out, fp_format) + + hjson_file = open("elwise_linear_transform.hjson") + trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(n_out)]) + + knl = set_memory_layout(knl) + knl = apply_transformation_list(knl, trans) + #print(knl) + _, avg_time = generic_test(queue, knl, backend="OPENCL", nruns=10, warmup=True) + print(avg_time) + analyze_knl_bandwidth(knl, avg_time) + """ + """ + # Test 
face_mass + pn = 3 + nvol_nodes = len(equidistant_nodes(pn,3)[1]) + nface_nodes = 10 + #nelements = 2**22 + nelements = 178746 + nfaces = 4 + fp_format = np.float64 + fp_string = "FP64" if fp_format == np.float64 else "FP32" + + knl = face_mass_prg(178746, 4, 20, 20, np.float64) + knl = set_memory_layout(knl) + #knl = gen_face_mass_knl(nelements, nfaces, nvol_nodes, nface_nodes, fp_format) + #knl = gen_face_mass_knl_merged(nelements, nfaces, nvol_nodes, nface_nodes, fp_format) + # Need to load these from file + #hjson_file = open("elwise_linear_transform.hjson") + #trans = load_transformations_from_file(hjson_file, [tid, fp_string, str(pn)]) + #knl = apply_transformation_list(knl, trans) + print(knl) + _, avg_time = test_face_mass(queue, knl, backend="OPENCL", nruns=10, warmup=True) + #_, avg_time = test_face_mass_merged(queue, knl, backend="OPENCL", nruns=10, warmup=True) + print(avg_time) + analyze_knl_bandwidth(knl, avg_time) + """ + + # Test order=4 copy + """ + knl = lp.make_copy_kernel("f,f", old_dim_tags="f,f") + knl = lp.add_dtypes(knl, {"input": np.float64, "output": np.float64}) + knl = lp.fix_parameters(knl, {"n0": 178746, "n1": 35}) + knl = lp.split_iname(knl, "i0", 48, outer_tag="g.0") + knl = lp.split_iname(knl, "i0_inner", 16, outer_tag="ilp", inner_tag="l.0") + knl = lp.split_iname(knl, "i1", 35, outer_tag="g.1", inner_tag="l.1") + for arg in knl.default_entrypoint.args: + if arg.name == "input": + arg.tags = IsDOFArray() + arg.shape = (178746, 35) + if arg.name == "output": + arg.tags = IsDOFArray() + arg.is_output = True + arg.shape = (178746, 35) + + print(knl) + _, avg_time = generic_test(queue, knl) + analyze_knl_bandwidth(knl, avg_time) + #knl = lp.split_iname(knl, "i1", 1024//2, inner_tag="l.0", outer_tag="g.0", slabs=(0,1)) + #knl = lp.split_iname(knl, "i1", 1024, inner_tag="l.0", outer_tag="g.0", slabs=(0,1)) + #knl = lp.split_iname(knl, "i1", 6*16, outer_tag="g.0") + #knl = lp.split_iname(knl, "i1_inner", 16, outer_tag="ilp", 
inner_tag="l.0", slabs=(0,1)) + #knl = lp.split_iname(knl, "i0", n0, inner_tag="l.1", outer_tag="g.1", slabs=(0,0)) + """ + + + #""" + # Test autotuner + knl = diff_prg(3, 1000000, 3, np.float64) + #print(knl) + #print(knl.default_entrypoint.domains) + #print(knl.default_entrypoint.instructions) + #exit() + #knl = diff_prg(3, 196608, 10, np.float64) + #knl = elwise_linear_prg(24576, 120, np.float64) + #dofs = 84 + #knl = elwise_linear_prg(1000000, 3*dofs, np.float64, nnodes_in=dofs) + #start_param = (24, 4, 126, 9, 28)#(96, 32, 60, 2, 5) + start_param = None + ## Figure out the actual dimensions + #knl = face_mass_prg(178746, 4, 20, 20, np.float64) + + # Spock + #result = exhaustive_search(queue, knl, generic_test, time_limit=np.inf, max_gflops=11540, device_memory_bandwidth=1047, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param) + #pspace_generator = gen_autotune_list(queue, knl) + #print(len(result)) + + # Titan V + #result = exhaustive_search(queue, knl, generic_test, time_limit=np.inf, max_gflops=6144, device_memory_bandwidth=580, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param) + #print(result) + pspace_generator = gen_autotune_list + tlist_generator = mxm_trans_list_generator + result = exhaustive_search_v2(queue, knl, generic_test, pspace_generator, tlist_generator, time_limit=np.inf, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param) + + #result = exhaustive_search_v2(queue, knl, generic_test, pspace_generator, tlist_generator, time_limit=np.inf, max_gflops=6144, device_memory_bandwidth=580, gflops_cutoff=0.95, bandwidth_cutoff=1.0, start_param=start_param) diff --git a/grudge/loopy_dg_kernels/test_import_mpi4py.py b/grudge/loopy_dg_kernels/test_import_mpi4py.py new file mode 100644 index 000000000..9bbbd91c9 --- /dev/null +++ b/grudge/loopy_dg_kernels/test_import_mpi4py.py @@ -0,0 +1,3 @@ +import mpi4py.MPI as MPI + +comm = MPI.COMM_WORLD diff --git a/grudge/models/advection.py b/grudge/models/advection.py 
index cfe1a4920..c5d35e2d1 100644 --- a/grudge/models/advection.py +++ b/grudge/models/advection.py @@ -214,13 +214,13 @@ def __init__(self, dcoll, v, inflow_u, flux_type="central", quad_tag=None): self.quad_tag = quad_tag def flux(self, u_tpair): - from grudge.dof_desc import DD_VOLUME + from grudge.dof_desc import DD_VOLUME_ALL - surf_v = op.project(self.dcoll, DD_VOLUME, u_tpair.dd, self.v) + surf_v = op.project(self.dcoll, DD_VOLUME_ALL, u_tpair.dd, self.v) return advection_weak_flux(self.dcoll, self.flux_type, u_tpair, surf_v) def operator(self, t, u): - from grudge.dof_desc import DOFDesc, DD_VOLUME, DTAG_VOLUME_ALL + from grudge.dof_desc import DOFDesc, DD_VOLUME_ALL, DTAG_VOLUME_ALL from meshmode.mesh import BTAG_ALL from meshmode.discretization.connection import FACE_RESTR_ALL @@ -234,7 +234,7 @@ def flux(tpair): return op.project(dcoll, tpair.dd, face_dd, self.flux(tpair)) def to_quad(arg): - return op.project(dcoll, DD_VOLUME, quad_dd, arg) + return op.project(dcoll, DD_VOLUME_ALL, quad_dd, arg) if self.inflow_u is not None: inflow_flux = flux(op.bv_trace_pair(dcoll, @@ -279,7 +279,7 @@ def to_quad(arg): # {{{ closed surface advection def v_dot_n_tpair(actx, dcoll, velocity, trace_dd): - from grudge.dof_desc import DTAG_BOUNDARY + from grudge.dof_desc import BoundaryDomainTag from grudge.trace_pair import TracePair from meshmode.discretization.connection import FACE_RESTR_INTERIOR @@ -287,10 +287,9 @@ def v_dot_n_tpair(actx, dcoll, velocity, trace_dd): v_dot_n = velocity.dot(normal) i = op.project(dcoll, trace_dd.with_discr_tag(None), trace_dd, v_dot_n) - if trace_dd.domain_tag is FACE_RESTR_INTERIOR: - e = dcoll.opposite_face_connection()(i) - elif isinstance(trace_dd.domain_tag, DTAG_BOUNDARY): - e = dcoll.distributed_boundary_swap_connection(trace_dd)(i) + assert isinstance(trace_dd.domain_tag, BoundaryDomainTag) + if trace_dd.domain_tag.tag is FACE_RESTR_INTERIOR: + e = dcoll.opposite_face_connection(trace_dd.domain_tag)(i) else: raise 
ValueError("Unrecognized domain tag: %s" % trace_dd.domain_tag) @@ -325,9 +324,9 @@ def __init__(self, dcoll, v, flux_type="central", quad_tag=None): self.quad_tag = quad_tag def flux(self, u_tpair): - from grudge.dof_desc import DD_VOLUME + from grudge.dof_desc import DD_VOLUME_ALL - surf_v = op.project(self.dcoll, DD_VOLUME, + surf_v = op.project(self.dcoll, DD_VOLUME_ALL, u_tpair.dd.with_discr_tag(None), self.v) return surface_advection_weak_flux(self.dcoll, self.flux_type, @@ -335,7 +334,7 @@ def flux(self, u_tpair): surf_v) def operator(self, t, u): - from grudge.dof_desc import DOFDesc, DD_VOLUME, DTAG_VOLUME_ALL + from grudge.dof_desc import DOFDesc, DD_VOLUME_ALL, DTAG_VOLUME_ALL from meshmode.discretization.connection import FACE_RESTR_ALL face_dd = DOFDesc(FACE_RESTR_ALL, self.quad_tag) @@ -347,7 +346,7 @@ def flux(tpair): return op.project(dcoll, tpair.dd, face_dd, self.flux(tpair)) def to_quad(arg): - return op.project(dcoll, DD_VOLUME, quad_dd, arg) + return op.project(dcoll, DD_VOLUME_ALL, quad_dd, arg) quad_v = to_quad(self.v) quad_u = to_quad(u) diff --git a/grudge/op.py b/grudge/op.py index 015e0718b..9e234a1d8 100644 --- a/grudge/op.py +++ b/grudge/op.py @@ -68,7 +68,6 @@ THE SOFTWARE. 
""" - from arraycontext import (ArrayContext, map_array_container, tag_axes, ArrayOrContainer) @@ -81,13 +80,23 @@ DiscretizationFaceAxisTag) from grudge.discretization import DiscretizationCollection +from grudge.dof_desc import as_dofdesc from pytools import keyed_memoize_in from pytools.obj_array import make_obj_array import numpy as np +import loopy as lp import grudge.dof_desc as dof_desc +from grudge.dof_desc import ( + DD_VOLUME_ALL, FACE_RESTR_ALL, DISCR_TAG_BASE, + DOFDesc, VolumeDomainTag +) + +from grudge.grudge_tags import (KernelDataTag, IsDOFArray, IsOpArray, + ParameterValue, IsVecOpArray, IsVecDOFArray, IsFourAxisDOFArray, + IsFaceMassOpArray, IsFaceDOFArray) from grudge.interpolation import interp from grudge.projection import project @@ -113,8 +122,10 @@ interior_trace_pair, interior_trace_pairs, local_interior_trace_pair, - connected_ranks, + inter_volume_trace_pairs, + local_inter_volume_trace_pairs, cross_rank_trace_pairs, + cross_rank_inter_volume_trace_pairs, bdry_trace_pair, bv_trace_pair ) @@ -142,8 +153,10 @@ "interior_trace_pair", "interior_trace_pairs", "local_interior_trace_pair", - "connected_ranks", + "inter_volume_trace_pairs", + "local_inter_volume_trace_pairs", "cross_rank_trace_pairs", + "cross_rank_inter_volume_trace_pairs", "bdry_trace_pair", "bv_trace_pair", @@ -174,6 +187,43 @@ def _single_axis_derivative_kernel( # - whether the chain rule terms ("inv_jac_mat") sit outside (strong) # or inside (weak) the matrix-vector product that carries out the # derivative, cf. "metric_in_matvec". 
+ + data = [] + for out_grp, in_grp, vec_i, ijm_i in zip(out_discr.groups, in_discr.groups, vec, inv_jac_mat): + ref_stiffT_mat = get_diff_mat( + actx, + out_element_group=out_grp, + in_element_group=in_grp + ) + + fp_format = vec_i.dtype + Nr, Ni, _ = ref_stiffT_mat.shape + Ne, Nj = vec_i.shape + + kernel_data = [ + lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("ref_stiffT_mat", fp_format, shape=(Nr, Ni, Nj), offset=lp.auto, tags=[IsVecOpArray()]), + lp.GlobalArg("inv_jac_t", fp_format, shape=(Nr, Ne, Nj), offset=lp.auto, tags=[IsVecDOFArray()]), + lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + lp.ValueArg("Nr", tags=[ParameterValue(Nr)]), + ... + ] + + kd_tag = KernelDataTag(kernel_data) + + data.append(actx.einsum("rej,rij,ej->ei" if metric_in_matvec else "rei,rij,ej->ei", + ijm_i[xyz_axis], + ref_stiffT_mat, + vec_i, + arg_names=("inv_jac_t", "ref_stiffT_mat", "vec", ), + tagged=(FirstAxisIsElementsTag(),kd_tag))) + + return DOFArray(actx, data = tuple(data)) + + """ return DOFArray( actx, data=tuple( @@ -191,12 +241,58 @@ def _single_axis_derivative_kernel( for out_grp, in_grp, vec_i, ijm_i in zip( out_discr.groups, in_discr.groups, vec, inv_jac_mat))) + """ def _gradient_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec, *, metric_in_matvec): # See _single_axis_derivative_kernel for comments on the usage scenarios # (both strong and weak derivative) and their differences. 
+ + per_group_grads = [] + for out_grp, in_grp, vec_i, ijm_i, in zip(out_discr.groups, in_discr.groups, vec, inv_jac_mat): + + ref_stiffT_mat = get_diff_mat( + actx, + out_element_group=out_grp, + in_element_group=in_grp + ) + + fp_format = vec_i.dtype + Nx, _, _, _ = inv_jac_mat._data[0].shape + Nr, Ni, _ = ref_stiffT_mat.shape + Ne, Nj = vec_i.shape + + kernel_data = [ + lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("ref_stiffT_mat", fp_format, shape=(Nr, Ni, Nj), offset=lp.auto, tags=[IsVecOpArray()]), + lp.GlobalArg("inv_jac_t", fp_format, shape=(Nx, Nr, Ne, Nj), offset=lp.auto, tags=[IsFourAxisDOFArray()]), + lp.GlobalArg("out", fp_format, shape=(Nx, Ne, Ni), offset=lp.auto, tags=[IsVecDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + lp.ValueArg("Nr", tags=[ParameterValue(Nr)]), + lp.ValueArg("Nx", tags=[ParameterValue(Nx)]), + ... 
+ ] + + kd_tag = KernelDataTag(kernel_data) + + # r for rst axis + # x for xyz axis + per_group_grads.append(actx.einsum("xrej,rij,ej->xei" if metric_in_matvec else "xrei,rij,ej->xei", + ijm_i, + get_diff_mat( + actx, + out_element_group=out_grp, + in_element_group=in_grp + ), + vec_i, + arg_names=("inv_jac_t", "ref_stiffT_mat", "vec"), + tagged=(FirstAxisIsElementsTag(),kd_tag))) + + + """ per_group_grads = [ # r for rst axis # x for xyz axis @@ -213,6 +309,7 @@ def _gradient_kernel(actx, out_discr, in_discr, get_diff_mat, inv_jac_mat, vec, for out_grp, in_grp, vec_i, ijm_i in zip( out_discr.groups, in_discr.groups, vec, inv_jac_mat)] + """ return make_obj_array([ DOFArray( @@ -243,15 +340,13 @@ def get_ref_derivative_mats(grp): return get_ref_derivative_mats(out_element_group) -def _strong_scalar_grad(dcoll, dd_in, vec): - assert dd_in == dof_desc.as_dofdesc(dof_desc.DD_VOLUME) - +def _strong_scalar_grad(dcoll, dd, vec): from grudge.geometry import inverse_surface_metric_derivative_mat - discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME) + discr = dcoll.discr_from_dd(dd) actx = vec.array_context - inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, + inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd, _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) return _gradient_kernel(actx, discr, discr, _reference_derivative_matrices, inverse_jac_mat, vec, @@ -259,7 +354,7 @@ def _strong_scalar_grad(dcoll, dd_in, vec): def local_grad( - dcoll: DiscretizationCollection, vec, *, nested=False) -> ArrayOrContainer: + dcoll: DiscretizationCollection, *args, nested=False) -> ArrayOrContainer: r"""Return the element-local gradient of a function :math:`f` represented by *vec*: @@ -268,24 +363,35 @@ def local_grad( \nabla|_E f = \left( \partial_x|_E f, \partial_y|_E f, \partial_z|_E f \right) + May be called with ``(vec)`` or ``(dd, vec)``. 
+ :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. + :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + Defaults to the base volume discretization if not provided. :arg nested: return nested object arrays instead of a single multidimensional array if *vec* is non-scalar. :returns: an object array (possibly nested) of :class:`~meshmode.dof_array.DOFArray`\ s or :class:`~arraycontext.ArrayContainer` of object arrays. """ - dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + if len(args) == 1: + vec, = args + dd = DD_VOLUME_ALL + elif len(args) == 2: + dd, vec = args + else: + raise TypeError("invalid number of arguments") + from grudge.tools import rec_map_subarrays return rec_map_subarrays( - partial(_strong_scalar_grad, dcoll, dd_in), + partial(_strong_scalar_grad, dcoll, dd), (), (dcoll.ambient_dim,), vec, scalar_cls=DOFArray, return_nested=nested,) def local_d_dx( - dcoll: DiscretizationCollection, xyz_axis, vec) -> ArrayOrContainer: + dcoll: DiscretizationCollection, xyz_axis, *args) -> ArrayOrContainer: r"""Return the element-local derivative along axis *xyz_axis* of a function :math:`f` represented by *vec*: @@ -293,22 +399,34 @@ def local_d_dx( \frac{\partial f}{\partial \lbrace x,y,z\rbrace}\Big|_E + May be called with ``(vec)`` or ``(dd, vec)``. + :arg xyz_axis: an integer indicating the axis along which the derivative is taken. + :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + Defaults to the base volume discretization if not provided. :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. :returns: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. 
""" + if len(args) == 1: + vec, = args + dd = DD_VOLUME_ALL + elif len(args) == 2: + dd, vec = args + else: + raise TypeError("invalid number of arguments") + if not isinstance(vec, DOFArray): - return map_array_container(partial(local_d_dx, dcoll, xyz_axis), vec) + return map_array_container(partial(local_d_dx, dcoll, xyz_axis, dd), vec) - discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME) + discr = dcoll.discr_from_dd(dd) actx = vec.array_context from grudge.geometry import inverse_surface_metric_derivative_mat - inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, - _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) + inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd, + _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) return _single_axis_derivative_kernel( actx, discr, discr, @@ -316,7 +434,7 @@ def local_d_dx( metric_in_matvec=False) -def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer: +def local_div(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: r"""Return the element-local divergence of the vector function :math:`\mathbf{f}` represented by *vecs*: @@ -324,6 +442,10 @@ def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer: \nabla|_E \cdot \mathbf{f} = \sum_{i=1}^d \partial_{x_i}|_E \mathbf{f}_i + May be called with ``(vec)`` or ``(dd, vec)``. + + :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + Defaults to the base volume discretization if not provided. :arg vecs: an object array of :class:`~meshmode.dof_array.DOFArray`\s or an :class:`~arraycontext.ArrayContainer` object @@ -332,13 +454,21 @@ def local_div(dcoll: DiscretizationCollection, vecs) -> ArrayOrContainer: :returns: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. 
""" + if len(args) == 1: + vec, = args + dd = DD_VOLUME_ALL + elif len(args) == 2: + dd, vec = args + else: + raise TypeError("invalid number of arguments") + from grudge.tools import rec_map_subarrays return rec_map_subarrays( lambda vec: sum( - local_d_dx(dcoll, i, vec_i) + local_d_dx(dcoll, i, dd, vec_i) for i, vec_i in enumerate(vec)), (dcoll.ambient_dim,), (), - vecs, scalar_cls=DOFArray) + vec, scalar_cls=DOFArray) # }}} @@ -391,10 +521,12 @@ def get_ref_stiffness_transpose_mat(out_grp, in_grp): def _weak_scalar_grad(dcoll, dd_in, vec): from grudge.geometry import inverse_surface_metric_derivative_mat + dd_in = as_dofdesc(dd_in) in_discr = dcoll.discr_from_dd(dd_in) - out_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME) + out_discr = dcoll.discr_from_dd(dd_in.with_discr_tag(DISCR_TAG_BASE)) actx = vec.array_context + # TODO: Figure out if this should be dd=dd_in or dd=dd_out inverse_jac_mat = inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd_in, times_area_element=True, _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) @@ -429,7 +561,7 @@ def weak_local_grad( """ if len(args) == 1: vecs, = args - dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd_in = DD_VOLUME_ALL elif len(args) == 2: dd_in, vecs = args else: @@ -474,7 +606,7 @@ def weak_local_d_dx(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: """ if len(args) == 2: xyz_axis, vec = args - dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd_in = dof_desc.DD_VOLUME_ALL elif len(args) == 3: dd_in, xyz_axis, vec = args else: @@ -488,8 +620,9 @@ def weak_local_d_dx(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: from grudge.geometry import inverse_surface_metric_derivative_mat + dd_in = as_dofdesc(dd_in) in_discr = dcoll.discr_from_dd(dd_in) - out_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME) + out_discr = dcoll.discr_from_dd(dd_in.with_discr_tag(DISCR_TAG_BASE)) actx = vec.array_context inverse_jac_mat = 
inverse_surface_metric_derivative_mat(actx, dcoll, dd=dd_in, @@ -533,7 +666,7 @@ def weak_local_div(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: """ if len(args) == 1: vecs, = args - dd_in = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd_in = DD_VOLUME_ALL elif len(args) == 2: dd_in, vecs = args else: @@ -602,6 +735,44 @@ def _apply_mass_operator( actx = vec.array_context area_elements = area_element(actx, dcoll, dd=dd_in, _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) + + # out[e, i] = reduce(sum, [j], mass_mat[i, j]*jac[e, j]*vec[e, j]) + + esums = [] + for in_grp, out_grp, ae_i, vec_i in zip(in_discr.groups, out_discr.groups, area_elements, vec): + mass_mat = reference_mass_matrix( + actx, + out_element_group=out_grp, + in_element_group=in_grp + ) + + fp_format = vec_i.dtype + Ni, Nj = mass_mat.shape + Ne, Nj = vec_i.shape + kernel_data = [ + lp.GlobalArg("mass_mat", fp_format, shape=(Ni, Nj), offset=lp.auto, tags=[IsOpArray()]), + lp.GlobalArg("jac", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("vec", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + ... 
+ ] + + kd_tag = KernelDataTag(kernel_data) + + esum = actx.einsum("ij,ej,ej->ei", + mass_mat, + ae_i, + vec_i, + arg_names=("mass_mat", "jac", "vec"), + tagged=(FirstAxisIsElementsTag(),kd_tag)) + esums.append(esum) + + return DOFArray(actx, data=tuple(esums)) + + """ return DOFArray( actx, data=tuple( @@ -620,7 +791,8 @@ def _apply_mass_operator( in_discr.groups, out_discr.groups, area_elements, vec) ) ) - + """ + def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: r"""Return the action of the DG mass matrix on a vector (or vectors) @@ -628,7 +800,7 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: *vec* being an :class:`~arraycontext.ArrayContainer`, the mass operator is applied component-wise. - May be called with ``(vec)`` or ``(dd, vec)``. + May be called with ``(vec)`` or ``(dd_in, vec)``. Specifically, this function applies the mass matrix elementwise on a vector of coefficients :math:`\mathbf{f}` via: @@ -640,7 +812,7 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: where :math:`\phi_i` are local polynomial basis functions on :math:`E`. - :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + :arg dd_in: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. Defaults to the base volume discretization if not provided. :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. 
@@ -650,13 +822,15 @@ def mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: if len(args) == 1: vec, = args - dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd_in = dof_desc.DD_VOLUME_ALL elif len(args) == 2: - dd, vec = args + dd_in, vec = args else: raise TypeError("invalid number of arguments") - return _apply_mass_operator(dcoll, dof_desc.DD_VOLUME, dd, vec) + dd_out = dd_in.with_discr_tag(DISCR_TAG_BASE) + + return _apply_mass_operator(dcoll, dd_out, dd_in, vec) # }}} @@ -701,20 +875,44 @@ def _apply_inverse_mass_operator( discr = dcoll.discr_from_dd(dd_in) inv_area_elements = 1./area_element(actx, dcoll, dd=dd_in, _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) - group_data = [ + + group_data = [] + for grp, jac_inv, vec_i in zip(discr.groups, inv_area_elements, vec): + + ref_mass_inverse = reference_inverse_mass_matrix(actx, + element_group=grp) + + fp_format = vec_i.dtype + Ne, Nj = vec_i.shape + _, Ni = jac_inv.shape + + kernel_data = [ + lp.GlobalArg("arg2", fp_format, shape=(Ne, Nj), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("arg1", fp_format, shape=(Ni, Nj), offset=lp.auto, tags=[IsOpArray()]), + lp.GlobalArg("arg0", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()]), + lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + ... 
+ ] + + kd_tag = KernelDataTag(kernel_data) + + group_data.append( + # Based on https://arxiv.org/pdf/1608.03836.pdf # true_Minv ~ ref_Minv * ref_M * (1/jac_det) * ref_Minv actx.einsum("ei,ij,ej->ei", jac_inv, reference_inverse_mass_matrix(actx, element_group=grp), vec_i, - tagged=(FirstAxisIsElementsTag(),)) - for grp, jac_inv, vec_i in zip(discr.groups, inv_area_elements, vec)] + tagged=(FirstAxisIsElementsTag(),kd_tag,))) return DOFArray(actx, data=tuple(group_data)) -def inverse_mass(dcoll: DiscretizationCollection, vec) -> ArrayOrContainer: +def inverse_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: r"""Return the action of the DG mass matrix inverse on a vector (or vectors) of :class:`~meshmode.dof_array.DOFArray`\ s, *vec*. In the case of *vec* being an :class:`~arraycontext.ArrayContainer`, @@ -744,15 +942,24 @@ def inverse_mass(dcoll: DiscretizationCollection, vec) -> ArrayOrContainer: where :math:`\widehat{\mathbf{M}}` is the reference mass matrix on :math:`\widehat{E}`. + May be called with ``(vec)`` or ``(dd, vec)``. + :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. + :arg dd: a :class:`~grudge.dof_desc.DOFDesc`, or a value convertible to one. + Defaults to the base volume discretization if not provided. :returns: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` like *vec*. 
""" + if len(args) == 1: + vec, = args + dd = DD_VOLUME_ALL + elif len(args) == 2: + dd, vec = args + else: + raise TypeError("invalid number of arguments") - return _apply_inverse_mass_operator( - dcoll, dof_desc.DD_VOLUME, dof_desc.DD_VOLUME, vec - ) + return _apply_inverse_mass_operator(dcoll, dd, dd, vec) # }}} @@ -850,23 +1057,76 @@ def get_ref_face_mass_mat(face_grp, vol_grp): return get_ref_face_mass_mat(face_element_group, vol_element_group) -def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd, vec): +def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd_in, vec): if not isinstance(vec, DOFArray): return map_array_container( - partial(_apply_face_mass_operator, dcoll, dd), vec + partial(_apply_face_mass_operator, dcoll, dd_in), vec ) from grudge.geometry import area_element - volm_discr = dcoll.discr_from_dd(dof_desc.DD_VOLUME) - face_discr = dcoll.discr_from_dd(dd) + dd_out = DOFDesc( + VolumeDomainTag(dd_in.domain_tag.volume_tag), + DISCR_TAG_BASE) + + volm_discr = dcoll.discr_from_dd(dd_out) + face_discr = dcoll.discr_from_dd(dd_in) dtype = vec.entry_dtype actx = vec.array_context assert len(face_discr.groups) == len(volm_discr.groups) - surf_area_elements = area_element(actx, dcoll, dd=dd, + surf_area_elements = area_element(actx, dcoll, dd=dd_in, _use_geoderiv_connection=actx.supports_nonscalar_broadcasting) + data = [] + for vgrp, afgrp, vec_i, surf_ae_i, in zip(volm_discr.groups, face_discr.groups, vec, surf_area_elements): + + + ref_fm_mat = reference_face_mass_matrix( + actx, + face_element_group=afgrp, + vol_element_group=vgrp, + dtype=dtype) + + fp_format = dtype + Ni, Nf, Nj = ref_fm_mat.shape + Ne = vgrp.nelements + + kernel_data = [ + lp.GlobalArg("vec", fp_format, shape=(Nf, Ne, Nj), offset=lp.auto, tags=[IsFaceDOFArray()]), + lp.GlobalArg("jac_surf", fp_format, shape=(Nf, Ne, Nj), offset=lp.auto, tags=[IsFaceDOFArray()]), + lp.GlobalArg("ref_face_mass_mat", fp_format, shape=(Ni, Nf, Nj), + offset=lp.auto, 
tags=[IsFaceMassOpArray()]), + lp.GlobalArg("out", fp_format, shape=(Ne, Ni), offset=lp.auto, tags=[IsDOFArray()], is_output=True), + lp.ValueArg("Ni", tags=[ParameterValue(Ni)]), + lp.ValueArg("Nj", tags=[ParameterValue(Nj)]), + lp.ValueArg("Ne", tags=[ParameterValue(Ne)]), + lp.ValueArg("Nf", tags=[ParameterValue(Nf)]), + ... + ] + + kd_tag = KernelDataTag(kernel_data) + + data.append(actx.einsum("ifj,fej,fej->ei", + ref_fm_mat, + actx.tag_axis(1, DiscretizationElementAxisTag(), surf_ae_i.reshape( + vgrp.mesh_el_group.nfaces, + vgrp.nelements, + surf_ae_i.shape[-1])), + actx.tag_axis(0, DiscretizationFaceAxisTag(), vec_i.reshape( + vgrp.mesh_el_group.nfaces, + vgrp.nelements, + afgrp.nunit_dofs)), + arg_names=("ref_face_mass_mat", "jac_surf", "vec"), + tagged=(FirstAxisIsElementsTag(),kd_tag))) + + + + + + return DOFArray(actx, data=tuple(data)) + + """ return DOFArray( actx, data=tuple( @@ -892,7 +1152,10 @@ def _apply_face_mass_operator(dcoll: DiscretizationCollection, dd, vec): for vgrp, afgrp, vec_i, surf_ae_i in zip(volm_discr.groups, face_discr.groups, vec, - surf_area_elements))) + surf_area_elements) + ) + ) + """ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: @@ -901,7 +1164,7 @@ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: *vec* being an arbitrary :class:`~arraycontext.ArrayContainer`, the face mass operator is applied component-wise. - May be called with ``(vec)`` or ``(dd, vec)``. + May be called with ``(vec)`` or ``(dd_in, vec)``. 
Specifically, this function applies the face mass matrix elementwise on a vector of coefficients :math:`\mathbf{f}` as the sum of contributions for @@ -932,13 +1195,13 @@ def face_mass(dcoll: DiscretizationCollection, *args) -> ArrayOrContainer: if len(args) == 1: vec, = args - dd = dof_desc.DOFDesc("all_faces", dof_desc.DISCR_TAG_BASE) + dd_in = DD_VOLUME_ALL.trace(FACE_RESTR_ALL) elif len(args) == 2: - dd, vec = args + dd_in, vec = args else: raise TypeError("invalid number of arguments") - return _apply_face_mass_operator(dcoll, dd, vec) + return _apply_face_mass_operator(dcoll, dd_in, vec) # }}} diff --git a/grudge/projection.py b/grudge/projection.py index 425239591..e21e02295 100644 --- a/grudge/projection.py +++ b/grudge/projection.py @@ -37,13 +37,19 @@ from arraycontext import ArrayOrContainer from grudge.discretization import DiscretizationCollection -from grudge.dof_desc import as_dofdesc +from grudge.dof_desc import ( + as_dofdesc, + VolumeDomainTag, + BoundaryDomainTag, + ConvertibleToDOFDesc) from numbers import Number def project( - dcoll: DiscretizationCollection, src, tgt, vec) -> ArrayOrContainer: + dcoll: DiscretizationCollection, + src: "ConvertibleToDOFDesc", + tgt: "ConvertibleToDOFDesc", vec) -> ArrayOrContainer: """Project from one discretization to another, e.g. from the volume to the boundary, or from the base to the an overintegrated quadrature discretization. @@ -55,10 +61,24 @@ def project( :returns: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` like *vec*. 
""" - src = as_dofdesc(src) - tgt = as_dofdesc(tgt) + # {{{ process dofdesc arguments - if isinstance(vec, Number) or src == tgt: + src_dofdesc = as_dofdesc(src) + + contextual_volume_tag = None + if isinstance(src_dofdesc.domain_tag, VolumeDomainTag): + contextual_volume_tag = src_dofdesc.domain_tag.tag + elif isinstance(src_dofdesc.domain_tag, BoundaryDomainTag): + contextual_volume_tag = src_dofdesc.domain_tag.volume_tag + + tgt_dofdesc = as_dofdesc(tgt, _contextual_volume_tag=contextual_volume_tag) + + del src + del tgt + + # }}} + + if isinstance(vec, Number) or src_dofdesc == tgt_dofdesc: return vec - return dcoll.connection_from_dds(src, tgt)(vec) + return dcoll.connection_from_dds(src_dofdesc, tgt_dofdesc)(vec) diff --git a/grudge/reductions.py b/grudge/reductions.py index 95ed44726..bdcf5a7f9 100644 --- a/grudge/reductions.py +++ b/grudge/reductions.py @@ -94,7 +94,7 @@ def norm(dcoll: DiscretizationCollection, vec, p, dd=None) -> Scalar: :returns: a nonegative scalar denoting the norm. 
""" if dd is None: - dd = dof_desc.DD_VOLUME + dd = dof_desc.DD_VOLUME_ALL from arraycontext import get_container_context_recursively actx = get_container_context_recursively(vec) @@ -128,7 +128,7 @@ def nodal_sum(dcoll: DiscretizationCollection, dd, vec) -> Scalar: if comm is None: return nodal_sum_loc(dcoll, dd, vec) - # NOTE: Don't move this + # NOTE: Do not move, we do not want to import mpi4py in single-rank computations from mpi4py import MPI from arraycontext import get_container_context_recursively @@ -174,7 +174,7 @@ def nodal_min(dcoll: DiscretizationCollection, dd, vec, *, initial=None) -> Scal if comm is None: return nodal_min_loc(dcoll, dd, vec, initial=initial) - # NOTE: Don't move this + # NOTE: Do not move, we do not want to import mpi4py in single-rank computations from mpi4py import MPI actx = vec.array_context @@ -231,7 +231,7 @@ def nodal_max(dcoll: DiscretizationCollection, dd, vec, *, initial=None) -> Scal if comm is None: return nodal_max_loc(dcoll, dd, vec, initial=initial) - # NOTE: Don't move this + # NOTE: Do not move, we do not want to import mpi4py in single-rank computations from mpi4py import MPI actx = vec.array_context @@ -320,7 +320,7 @@ def _apply_elementwise_reduction( """ if len(args) == 1: vec, = args - dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd = dof_desc.DD_VOLUME_ALL elif len(args) == 2: dd, vec = args else: @@ -335,6 +335,7 @@ def _apply_elementwise_reduction( actx = vec.array_context + import loopy as lp if actx.supports_nonscalar_broadcasting: return DOFArray( actx, @@ -344,11 +345,12 @@ def _apply_elementwise_reduction( ) ) else: - @memoize_in(actx, (_apply_elementwise_reduction, - "elementwise_%s_prg" % op_name)) - def elementwise_prg(): + @memoize_in(actx, (_apply_elementwise_reduction, dd, + "elementwise_%s_prg" % op_name)) + def elementwise_prg(nelements, ndofs, fp_format): # FIXME: This computes the reduction value redundantly for each # output DOF. 
+ from grudge.grudge_tags import IsDOFArray, ParameterValue t_unit = make_loopy_program( [ "{[iel]: 0 <= iel < nelements}", @@ -357,21 +359,31 @@ def elementwise_prg(): """ result[iel, idof] = %s(jdof, operand[iel, jdof]) """ % op_name, + kernel_data=[ + lp.GlobalArg("result", fp_format, shape=("nelements", "ndofs"), + tags=[IsDOFArray()]), + lp.GlobalArg("operand", fp_format, shape=("nelements", "ndofs"), + tags=[IsDOFArray()]), + lp.ValueArg("ndofs", tags=[ParameterValue(ndofs)]), + lp.ValueArg("nelements", tags=[ParameterValue(nelements)]), + ... + ], name="grudge_elementwise_%s_knl" % op_name ) - import loopy as lp from meshmode.transform_metadata import ( ConcurrentElementInameTag, ConcurrentDOFInameTag) return lp.tag_inames(t_unit, { "iel": ConcurrentElementInameTag(), "idof": ConcurrentDOFInameTag()}) - return actx.tag_axis(1, DiscretizationDOFAxisTag(), - DOFArray( - actx, - data=tuple( - actx.call_loopy(elementwise_prg(), operand=vec_i)["result"] - for vec_i in vec))) + data = [] + for vec_i in vec: + iel, jdof = vec_i.shape + fp_format = vec_i.dtype + data.append(actx.call_loopy(elementwise_prg(iel, jdof, fp_format), + operand=vec_i)["result"]) + + return actx.tag_axis(1, DiscretizationDOFAxisTag(), DOFArray(actx, data=tuple(data))) def elementwise_sum( @@ -485,7 +497,7 @@ def elementwise_integral( """ if len(args) == 1: vec, = args - dd = dof_desc.DOFDesc("vol", dof_desc.DISCR_TAG_BASE) + dd = dof_desc.DD_VOLUME_ALL elif len(args) == 2: dd, vec = args else: diff --git a/grudge/shortcuts.py b/grudge/shortcuts.py index 0aca64a58..e6e62cc55 100644 --- a/grudge/shortcuts.py +++ b/grudge/shortcuts.py @@ -20,6 +20,8 @@ THE SOFTWARE. 
""" +from grudge.dof_desc import DD_VOLUME_ALL + from pytools import memoize_in @@ -76,11 +78,14 @@ def set_up_rk4(field_var_name, dt, fields, rhs, t_start=0.0): return dt_stepper -def make_visualizer(dcoll, vis_order=None, **kwargs): +def make_visualizer(dcoll, vis_order=None, volume_dd=None, **kwargs): from meshmode.discretization.visualization import make_visualizer + if volume_dd is None: + volume_dd = DD_VOLUME_ALL + return make_visualizer( dcoll._setup_actx, - dcoll.discr_from_dd("vol"), vis_order, **kwargs) + dcoll.discr_from_dd(volume_dd), vis_order, **kwargs) def make_boundary_visualizer(dcoll, vis_order=None, **kwargs): diff --git a/grudge/trace_pair.py b/grudge/trace_pair.py index cb2de38f6..333832340 100644 --- a/grudge/trace_pair.py +++ b/grudge/trace_pair.py @@ -18,12 +18,15 @@ .. autofunction:: bdry_trace_pair .. autofunction:: bv_trace_pair -Interior and cross-rank trace functions ---------------------------------------- +Interior, cross-rank, and inter-volume traces +--------------------------------------------- .. autofunction:: interior_trace_pairs .. autofunction:: local_interior_trace_pair +.. autofunction:: inter_volume_trace_pairs +.. autofunction:: local_inter_volume_trace_pairs .. autofunction:: cross_rank_trace_pairs +.. 
autofunction:: cross_rank_inter_volume_trace_pairs """ __copyright__ = """ @@ -51,17 +54,19 @@ """ -from typing import List, Hashable, Optional, Type, Any +from warnings import warn +from typing import List, Hashable, Optional, Tuple, Type, Any, Sequence, Mapping from pytools.persistent_dict import KeyBuilder from arraycontext import ( ArrayContainer, + ArrayContext, with_container_arithmetic, dataclass_array_container, - get_container_context_recursively, - flatten, to_numpy, - unflatten, from_numpy, + get_container_context_recursively_opt, + to_numpy, + from_numpy, ArrayOrContainer ) @@ -70,15 +75,20 @@ from numbers import Number from pytools import memoize_on_first_arg -from pytools.obj_array import obj_array_vectorize -from grudge.discretization import DiscretizationCollection +from grudge.discretization import DiscretizationCollection, PartID from grudge.projection import project from meshmode.mesh import BTAG_PARTITION import numpy as np + import grudge.dof_desc as dof_desc +from grudge.dof_desc import ( + DOFDesc, DD_VOLUME_ALL, FACE_RESTR_INTERIOR, DISCR_TAG_BASE, + VolumeTag, VolumeDomainTag, BoundaryDomainTag, + ConvertibleToDOFDesc, + ) # {{{ trace pair container class @@ -107,12 +117,22 @@ class TracePair: .. automethod:: __len__ """ - dd: dof_desc.DOFDesc + dd: DOFDesc interior: ArrayContainer exterior: ArrayContainer - def __init__(self, dd, *, interior, exterior): - object.__setattr__(self, "dd", dof_desc.as_dofdesc(dd)) + def __init__(self, dd: DOFDesc, *, + interior: ArrayOrContainer, + exterior: ArrayOrContainer): + if not isinstance(dd, DOFDesc): + warn("Constructing a TracePair with a first argument that is not " + "exactly a DOFDesc (but convertible to one) is deprecated. " + "This will stop working in July 2022. 
" + "Pass an actual DOFDesc instead.", + DeprecationWarning, stacklevel=2) + dd = dof_desc.as_dofdesc(dd) + + object.__setattr__(self, "dd", dd) object.__setattr__(self, "interior", interior) object.__setattr__(self, "exterior", exterior) @@ -178,7 +198,8 @@ def diff(self): # {{{ boundary trace pairs def bdry_trace_pair( - dcoll: DiscretizationCollection, dd, interior, exterior) -> TracePair: + dcoll: DiscretizationCollection, dd: "ConvertibleToDOFDesc", + interior, exterior) -> TracePair: """Returns a trace pair defined on the exterior boundary. Input arguments are assumed to already be defined on the boundary denoted by *dd*. If the input arguments *interior* and *exterior* are @@ -197,11 +218,19 @@ def bdry_trace_pair( be used for the flux. :returns: a :class:`TracePair` on the boundary. """ + if not isinstance(dd, DOFDesc): + warn("Calling bdry_trace_pair with a first argument that is not " + "exactly a DOFDesc (but convertible to one) is deprecated. " + "This will stop working in July 2022. " + "Pass an actual DOFDesc instead.", + DeprecationWarning, stacklevel=2) + dd = dof_desc.as_dofdesc(dd) return TracePair(dd, interior=interior, exterior=exterior) def bv_trace_pair( - dcoll: DiscretizationCollection, dd, interior, exterior) -> TracePair: + dcoll: DiscretizationCollection, dd: "ConvertibleToDOFDesc", + interior, exterior) -> TracePair: """Returns a trace pair defined on the exterior boundary. The interior argument is assumed to be defined on the volume discretization, and will therefore be restricted to the boundary *dd* prior to creating a @@ -223,21 +252,29 @@ def bv_trace_pair( be used for the flux. :returns: a :class:`TracePair` on the boundary. """ + if not isinstance(dd, DOFDesc): + warn("Calling bv_trace_pair with a first argument that is not " + "exactly a DOFDesc (but convertible to one) is deprecated. " + "This will stop working in July 2022. 
" + "Pass an actual DOFDesc instead.", + DeprecationWarning, stacklevel=2) + dd = dof_desc.as_dofdesc(dd) return bdry_trace_pair( - dcoll, dd, project(dcoll, "vol", dd, interior), exterior - ) + dcoll, dd, project(dcoll, dd.domain_tag.volume_tag, dd, interior), exterior) # }}} # {{{ interior trace pairs -def local_interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair: +def local_interior_trace_pair( + dcoll: DiscretizationCollection, vec, *, + volume_dd: Optional[DOFDesc] = None, + ) -> TracePair: r"""Return a :class:`TracePair` for the interior faces of *dcoll* with a discretization tag specified by *discr_tag*. This does not include interior faces on different MPI ranks. - :arg vec: a :class:`~meshmode.dof_array.DOFArray` or an :class:`~arraycontext.ArrayContainer` of them. @@ -250,21 +287,33 @@ def local_interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair computation. :returns: a :class:`TracePair` object. """ - i = project(dcoll, "vol", "int_faces", vec) + if volume_dd is None: + volume_dd = DD_VOLUME_ALL + + assert isinstance(volume_dd.domain_tag, VolumeDomainTag) + trace_dd = volume_dd.trace(FACE_RESTR_INTERIOR) + + interior = project(dcoll, volume_dd, trace_dd, vec) + + opposite_face_conn = dcoll.opposite_face_connection(trace_dd.domain_tag) - def get_opposite_face(el): - if isinstance(el, Number): - return el + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary else: - return dcoll.opposite_face_connection()(el) + return opposite_face_conn(ary) - e = obj_array_vectorize(get_opposite_face, i) + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + exterior = rec_map_array_container( + get_opposite_trace, + interior, + leaf_class=DOFArray) - return TracePair("int_faces", interior=i, exterior=e) + return TracePair(trace_dd, interior=interior, exterior=exterior) def interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair: - from warnings import warn 
warn("`grudge.op.interior_trace_pair` is deprecated and will be dropped " "in version 2022.x. Use `local_interior_trace_pair` " "instead, or `interior_trace_pairs` which also includes contributions " @@ -274,7 +323,8 @@ def interior_trace_pair(dcoll: DiscretizationCollection, vec) -> TracePair: def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *, - comm_tag: Hashable = None, tag: Hashable = None) -> List[TracePair]: + comm_tag: Hashable = None, tag: Hashable = None, + volume_dd: Optional[DOFDesc] = None) -> List[TracePair]: r"""Return a :class:`list` of :class:`TracePair` objects defined on the interior faces of *dcoll* and any faces connected to a parallel boundary. @@ -293,7 +343,6 @@ def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *, """ if tag is not None: - from warnings import warn warn("Specifying 'tag' is deprecated and will stop working in July of 2022. " "Specify 'comm_tag' instead.", DeprecationWarning, stacklevel=2) if comm_tag is not None: @@ -302,154 +351,428 @@ def interior_trace_pairs(dcoll: DiscretizationCollection, vec, *, comm_tag = tag del tag + if volume_dd is None: + volume_dd = DD_VOLUME_ALL + return ( - [local_interior_trace_pair(dcoll, vec)] - + cross_rank_trace_pairs(dcoll, vec, comm_tag=comm_tag) + [local_interior_trace_pair( + dcoll, vec, volume_dd=volume_dd)] + + cross_rank_trace_pairs( + dcoll, vec, comm_tag=comm_tag, volume_dd=volume_dd) ) # }}} -# {{{ distributed-memory functionality +# {{{ inter-volume trace pairs + +def local_inter_volume_trace_pairs( + dcoll: DiscretizationCollection, + pairwise_volume_data: Mapping[ + Tuple[DOFDesc, DOFDesc], + Tuple[ArrayOrContainer, ArrayOrContainer]] + ) -> Mapping[Tuple[DOFDesc, DOFDesc], TracePair]: + for vol_dd_pair in pairwise_volume_data.keys(): + for vol_dd in vol_dd_pair: + if not isinstance(vol_dd.domain_tag, VolumeDomainTag): + raise ValueError( + "pairwise_volume_data keys must describe volumes, " + f"got '{vol_dd}'") + if vol_dd.discretization_tag != 
DISCR_TAG_BASE: + raise ValueError( + "expected base-discretized DOFDesc in pairwise_volume_data, " + f"got '{vol_dd}'") + + rank = ( + dcoll.mpi_communicator.Get_rank() + if dcoll.mpi_communicator is not None + else None) + + result: Mapping[Tuple[DOFDesc, DOFDesc], TracePair] = {} + + for vol_dd_pair, vol_data_pair in pairwise_volume_data.items(): + from meshmode.mesh import mesh_has_boundary + if not mesh_has_boundary( + dcoll.discr_from_dd(vol_dd_pair[0]).mesh, + BTAG_PARTITION(PartID(vol_dd_pair[1].domain_tag.tag, rank))): + continue + + directional_vol_dd_pairs = [ + (vol_dd_pair[1], vol_dd_pair[0]), + (vol_dd_pair[0], vol_dd_pair[1])] + + trace_dd_pair = tuple( + self_vol_dd.trace( + BTAG_PARTITION( + PartID(other_vol_dd.domain_tag.tag, rank))) + for other_vol_dd, self_vol_dd in directional_vol_dd_pairs) + + # Pre-compute the projections out here to avoid doing it twice inside + # the loop below + trace_data = { + trace_dd: project(dcoll, vol_dd, trace_dd, vol_data) + for vol_dd, trace_dd, vol_data in zip( + vol_dd_pair, trace_dd_pair, vol_data_pair)} + + for other_vol_dd, self_vol_dd in directional_vol_dd_pairs: + self_part_id = PartID(self_vol_dd.domain_tag.tag, rank) + other_part_id = PartID(other_vol_dd.domain_tag.tag, rank) + + self_trace_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id)) + other_trace_dd = other_vol_dd.trace(BTAG_PARTITION(self_part_id)) + + self_trace_data = trace_data[self_trace_dd] + unswapped_other_trace_data = trace_data[other_trace_dd] + + other_to_self = dcoll._inter_part_connections[ + other_part_id, self_part_id] + + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return other_to_self(ary) # noqa: B023 + + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + other_trace_data = rec_map_array_container( + get_opposite_trace, + unswapped_other_trace_data, + leaf_class=DOFArray) + + result[other_vol_dd, self_vol_dd] = TracePair( + self_trace_dd, + 
interior=self_trace_data, + exterior=other_trace_data) + + return result + + +def inter_volume_trace_pairs(dcoll: DiscretizationCollection, + pairwise_volume_data: Mapping[ + Tuple[DOFDesc, DOFDesc], + Tuple[ArrayOrContainer, ArrayOrContainer]], + comm_tag: Hashable = None) -> Mapping[ + Tuple[DOFDesc, DOFDesc], + List[TracePair]]: + """ + Note that :func:`local_inter_volume_trace_pairs` provides the rank-local + contributions if those are needed in isolation. Similarly, + :func:`cross_rank_inter_volume_trace_pairs` provides only the trace pairs + defined on cross-rank boundaries. + """ + # TODO documentation + + result: Mapping[ + Tuple[DOFDesc, DOFDesc], + List[TracePair]] = {} + + local_tpairs = local_inter_volume_trace_pairs(dcoll, pairwise_volume_data) + cross_rank_tpairs = cross_rank_inter_volume_trace_pairs( + dcoll, pairwise_volume_data, comm_tag=comm_tag) + + for directional_vol_dd_pair, tpair in local_tpairs.items(): + result[directional_vol_dd_pair] = [tpair] + + for directional_vol_dd_pair, tpairs in cross_rank_tpairs.items(): + result.setdefault(directional_vol_dd_pair, []).extend(tpairs) + + return result + +# }}} + + +# {{{ distributed: helper functions + +class _TagKeyBuilder(KeyBuilder): + def update_for_type(self, key_hash, key: Type[Any]): + self.rec(key_hash, (key.__module__, key.__name__, key.__name__,)) + @memoize_on_first_arg -def connected_ranks(dcoll: DiscretizationCollection): - from meshmode.distributed import get_connected_partitions - return get_connected_partitions(dcoll._volume_discr.mesh) +def _connected_parts( + dcoll: DiscretizationCollection, + self_volume_tag: VolumeTag, + other_volume_tag: VolumeTag + ) -> Sequence[PartID]: + result: List[PartID] = [ + connected_part_id + for connected_part_id, part_id in dcoll._inter_part_connections.keys() + if ( + part_id.volume_tag == self_volume_tag + and connected_part_id.volume_tag == other_volume_tag)] + + return result -class _RankBoundaryCommunication: +def 
_sym_tag_to_num_tag(comm_tag: Optional[Hashable]) -> Optional[int]: + if comm_tag is None: + return comm_tag + + if isinstance(comm_tag, int): + return comm_tag + + # FIXME: This isn't guaranteed to be correct. + # See here for discussion: + # - https://github.com/illinois-ceesd/mirgecom/issues/617#issuecomment-1057082716 # noqa + # - https://github.com/inducer/grudge/pull/222 + + from mpi4py import MPI + tag_ub = MPI.COMM_WORLD.Get_attr(MPI.TAG_UB) + key_builder = _TagKeyBuilder() + digest = key_builder(comm_tag) + + num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub + + warn("Encountered unknown symbolic tag " + f"'{comm_tag}', assigning a value of '{num_tag}'. " + "This is a temporary workaround, please ensure that " + "tags are sufficiently distinct for your use case.") + + return num_tag + +# }}} + + +# {{{ eager rank-boundary communication + +class _RankBoundaryCommunicationEager: base_comm_tag = 1273 def __init__(self, - dcoll: DiscretizationCollection, - array_container: ArrayOrContainer, - remote_rank, comm_tag: Optional[int] = None): - actx = get_container_context_recursively(array_container) - btag = BTAG_PARTITION(remote_rank) + actx: ArrayContext, + dcoll: DiscretizationCollection, + *, + local_part_id: PartID, + remote_part_id: PartID, + local_bdry_data: ArrayOrContainer, + remote_bdry_data_template: ArrayOrContainer, + comm_tag: Optional[Hashable] = None): - local_bdry_data = project(dcoll, "vol", btag, array_container) comm = dcoll.mpi_communicator + assert comm is not None + + remote_rank = remote_part_id.rank + assert remote_rank is not None self.dcoll = dcoll self.array_context = actx - self.remote_btag = btag - self.bdry_discr = dcoll.discr_from_dd(btag) + self.local_part_id = local_part_id + self.remote_part_id = remote_part_id + self.local_bdry_dd = DOFDesc( + BoundaryDomainTag( + BTAG_PARTITION(remote_part_id), + volume_tag=local_part_id.volume_tag), + DISCR_TAG_BASE) + self.bdry_discr = 
dcoll.discr_from_dd(self.local_bdry_dd) self.local_bdry_data = local_bdry_data - self.local_bdry_data_np = \ - to_numpy(flatten(self.local_bdry_data, actx), actx) + self.remote_bdry_data_template = remote_bdry_data_template self.comm_tag = self.base_comm_tag + comm_tag = _sym_tag_to_num_tag(comm_tag) if comm_tag is not None: self.comm_tag += comm_tag + del comm_tag - # Here, we initialize both send and recieve operations through - # mpi4py `Request` (MPI_Request) instances for comm.Isend (MPI_Isend) - # and comm.Irecv (MPI_Irecv) respectively. These initiate non-blocking - # point-to-point communication requests and require explicit management - # via the use of wait (MPI_Wait, MPI_Waitall, MPI_Waitany, MPI_Waitsome), - # test (MPI_Test, MPI_Testall, MPI_Testany, MPI_Testsome), and cancel - # (MPI_Cancel). The rank-local data `self.local_bdry_data_np` will have its - # associated memory buffer sent across connected ranks and must not be - # modified at the Python level during this process. Completion of the - # requests is handled in :meth:`finish`. - # - # For more details on the mpi4py semantics, see: - # https://mpi4py.readthedocs.io/en/stable/overview.html#nonblocking-communications - # # NOTE: mpi4py currently (2021-11-03) holds a reference to the send # memory buffer for (i.e. `self.local_bdry_data_np`) until the send # requests is complete, however it is not clear that this is documented # behavior. We hold on to the buffer (via the instance attribute) # as well, just in case. 
- self.send_req = comm.Isend(self.local_bdry_data_np, - remote_rank, - tag=self.comm_tag) - self.remote_data_host_numpy = np.empty_like(self.local_bdry_data_np) - self.recv_req = comm.Irecv(self.remote_data_host_numpy, - remote_rank, - tag=self.comm_tag) + self.send_reqs = [] + self.send_data = [] + + def send_single_array(key, local_subary): + if not isinstance(local_subary, Number): + local_subary_np = to_numpy(local_subary, actx) + self.send_reqs.append( + comm.Isend(local_subary_np, remote_rank, tag=self.comm_tag)) + self.send_data.append(local_subary_np) + return local_subary + + self.recv_reqs = [] + self.recv_data = {} + + def recv_single_array(key, remote_subary_template): + if not isinstance(remote_subary_template, Number): + remote_subary_np = np.empty( + remote_subary_template.shape, + remote_subary_template.dtype) + self.recv_reqs.append( + comm.Irecv(remote_subary_np, remote_rank, tag=self.comm_tag)) + self.recv_data[key] = remote_subary_np + return remote_subary_template + + from arraycontext.container.traversal import rec_keyed_map_array_container + rec_keyed_map_array_container(send_single_array, local_bdry_data) + rec_keyed_map_array_container(recv_single_array, remote_bdry_data_template) def finish(self): - # Wait for the nonblocking receive request to complete before + from mpi4py import MPI + + # Wait for the nonblocking receive requests to complete before # accessing the data - self.recv_req.Wait() + MPI.Request.waitall(self.recv_reqs) - # Nonblocking receive is complete, we can now access the data and apply - # the boundary-swap connection - actx = self.array_context - remote_bdry_data_flat = from_numpy(self.remote_data_host_numpy, actx) - remote_bdry_data = unflatten(self.local_bdry_data, - remote_bdry_data_flat, actx) - bdry_conn = self.dcoll.distributed_boundary_swap_connection( - dof_desc.as_dofdesc(dof_desc.DTAG_BOUNDARY(self.remote_btag))) - swapped_remote_bdry_data = bdry_conn(remote_bdry_data) + def finish_single_array(key, 
remote_subary_template): + if isinstance(remote_subary_template, Number): + # NOTE: Assumes that the same number is passed on every rank + return remote_subary_template + else: + return from_numpy(self.recv_data[key], self.array_context) - # Complete the nonblocking send request associated with communicating - # `self.local_bdry_data_np` - self.send_req.Wait() + from arraycontext.container.traversal import rec_keyed_map_array_container + unswapped_remote_bdry_data = rec_keyed_map_array_container( + finish_single_array, self.remote_bdry_data_template) + + remote_to_local = self.dcoll._inter_part_connections[ + self.remote_part_id, self.local_part_id] - return TracePair(self.remote_btag, - interior=self.local_bdry_data, - exterior=swapped_remote_bdry_data) + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return remote_to_local(ary) + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + remote_bdry_data = rec_map_array_container( + get_opposite_trace, + unswapped_remote_bdry_data, + leaf_class=DOFArray) -from pytato import make_distributed_recv, staple_distributed_send + # Complete the nonblocking send requests + MPI.Request.waitall(self.send_reqs) + return TracePair( + self.local_bdry_dd, + interior=self.local_bdry_data, + exterior=remote_bdry_data) + +# }}} + + +# {{{ lazy rank-boundary communication class _RankBoundaryCommunicationLazy: def __init__(self, - dcoll: DiscretizationCollection, - array_container: ArrayOrContainer, - remote_rank: int, comm_tag: Hashable): + actx: ArrayContext, + dcoll: DiscretizationCollection, + *, + local_part_id: PartID, + remote_part_id: PartID, + local_bdry_data: ArrayOrContainer, + remote_bdry_data_template: ArrayOrContainer, + comm_tag: Optional[Hashable] = None) -> None: + if comm_tag is None: - raise ValueError("lazy communication requires 'tag' to be supplied") + raise ValueError("lazy communication requires 'comm_tag' to be supplied") + + remote_rank = 
remote_part_id.rank + assert remote_rank is not None self.dcoll = dcoll - self.array_context = get_container_context_recursively(array_container) - self.remote_btag = BTAG_PARTITION(remote_rank) - self.bdry_discr = dcoll.discr_from_dd(self.remote_btag) - - self.local_bdry_data = project( - dcoll, "vol", self.remote_btag, array_container) - - def communicate_single_array(key, local_bdry_ary): - ary_tag = (comm_tag, key) - return staple_distributed_send( - local_bdry_ary, dest_rank=remote_rank, comm_tag=ary_tag, - stapled_to=make_distributed_recv( + self.array_context = actx + self.local_bdry_dd = DOFDesc( + BoundaryDomainTag( + BTAG_PARTITION(remote_part_id), + volume_tag=local_part_id.volume_tag), + DISCR_TAG_BASE) + self.bdry_discr = dcoll.discr_from_dd(self.local_bdry_dd) + self.local_part_id = local_part_id + self.remote_part_id = remote_part_id + + from pytato import ( + make_distributed_recv, + make_distributed_send, + DistributedSendRefHolder) + + # TODO: This currently assumes that local_bdry_data and + # remote_bdry_data_template have the same structure. This is not true + # in general. 
Find a way to staple the sends appropriately when the number + # of recvs is not equal to the number of sends + # FIXME: Overly restrictive (just needs to be the same structure) + assert type(local_bdry_data) == type(remote_bdry_data_template) + + sends = {} + + def send_single_array(key, local_subary): + if isinstance(local_subary, Number): + return + else: + ary_tag = (comm_tag, key) + sends[key] = make_distributed_send( + local_subary, dest_rank=remote_rank, comm_tag=ary_tag) + + def recv_single_array(key, remote_subary_template): + if isinstance(remote_subary_template, Number): + # NOTE: Assumes that the same number is passed on every rank + return remote_subary_template + else: + ary_tag = (comm_tag, key) + return DistributedSendRefHolder( + sends[key], + make_distributed_recv( src_rank=remote_rank, comm_tag=ary_tag, - shape=local_bdry_ary.shape, dtype=local_bdry_ary.dtype, - axes=local_bdry_ary.axes)) + shape=remote_subary_template.shape, + dtype=remote_subary_template.dtype, + axes=remote_subary_template.axes)) from arraycontext.container.traversal import rec_keyed_map_array_container - self.remote_data = rec_keyed_map_array_container( - communicate_single_array, self.local_bdry_data) + + rec_keyed_map_array_container(send_single_array, local_bdry_data) + self.local_bdry_data = local_bdry_data + + self.unswapped_remote_bdry_data = rec_keyed_map_array_container( + recv_single_array, remote_bdry_data_template) def finish(self): - bdry_conn = self.dcoll.distributed_boundary_swap_connection( - dof_desc.as_dofdesc(dof_desc.DTAG_BOUNDARY(self.remote_btag))) + remote_to_local = self.dcoll._inter_part_connections[ + self.remote_part_id, self.local_part_id] + + def get_opposite_trace(ary): + if isinstance(ary, Number): + return ary + else: + return remote_to_local(ary) + + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + remote_bdry_data = rec_map_array_container( + get_opposite_trace, + self.unswapped_remote_bdry_data, + 
leaf_class=DOFArray) + + return TracePair( + self.local_bdry_dd, + interior=self.local_bdry_data, + exterior=remote_bdry_data) - return TracePair(self.remote_btag, - interior=self.local_bdry_data, - exterior=bdry_conn(self.remote_data)) +# }}} -class _TagKeyBuilder(KeyBuilder): - def update_for_type(self, key_hash, key: Type[Any]): - self.rec(key_hash, (key.__module__, key.__name__, key.__name__,)) +# {{{ cross_rank_trace_pairs + +def _replace_dof_arrays(array_container, dof_array): + from arraycontext import rec_map_array_container + from meshmode.dof_array import DOFArray + return rec_map_array_container( + lambda x: dof_array if isinstance(x, DOFArray) else x, + array_container, + leaf_class=DOFArray) def cross_rank_trace_pairs( - dcoll: DiscretizationCollection, ary, - comm_tag: Hashable = None, - tag: Hashable = None) -> List[TracePair]: + dcoll: DiscretizationCollection, ary: ArrayOrContainer, + tag: Hashable = None, + *, comm_tag: Hashable = None, + volume_dd: Optional[DOFDesc] = None) -> List[TracePair]: r"""Get a :class:`list` of *ary* trace pairs for each partition boundary. For each partition boundary, the field data values in *ary* are - communicated to/from the neighboring partition. Presumably, this - communication is MPI (but strictly speaking, may not be, and this - routine is agnostic to the underlying communication). + communicated to/from the neighboring part. Presumably, this communication + is MPI (but strictly speaking, may not be, and this routine is agnostic to + the underlying communication). For each face on each partition boundary, a :class:`TracePair` is created with the locally, and @@ -472,61 +795,227 @@ def cross_rank_trace_pairs( :returns: a :class:`list` of :class:`TracePair` objects. 
""" + # {{{ process arguments + + if volume_dd is None: + volume_dd = DD_VOLUME_ALL + + if not isinstance(volume_dd.domain_tag, VolumeDomainTag): + raise TypeError(f"expected a volume DOFDesc, got '{volume_dd}'") + if volume_dd.discretization_tag != DISCR_TAG_BASE: + raise TypeError(f"expected a base-discretized DOFDesc, got '{volume_dd}'") + if tag is not None: - from warnings import warn warn("Specifying 'tag' is deprecated and will stop working in July of 2022. " - "Specify 'comm_tag' instead.", DeprecationWarning, stacklevel=2) + "Specify 'comm_tag' (keyword-only) instead.", + DeprecationWarning, stacklevel=2) if comm_tag is not None: raise TypeError("may only specify one of 'tag' and 'comm_tag'") else: comm_tag = tag del tag - if isinstance(ary, Number): - # NOTE: Assumed that the same number is passed on every rank - return [TracePair(BTAG_PARTITION(remote_rank), interior=ary, exterior=ary) - for remote_rank in connected_ranks(dcoll)] + # }}} + + if dcoll.mpi_communicator is None: + return [] + + rank = dcoll.mpi_communicator.Get_rank() + + local_part_id = PartID(volume_dd.domain_tag.tag, rank) + + connected_part_ids = _connected_parts( + dcoll, self_volume_tag=volume_dd.domain_tag.tag, + other_volume_tag=volume_dd.domain_tag.tag) + + remote_part_ids = [ + part_id + for part_id in connected_part_ids + if part_id.rank != rank] - actx = get_container_context_recursively(ary) + # This asserts that there is only one data exchange per rank, so that + # there is no risk of mismatched data reaching the wrong recipient. + # (Since we have only a single tag.) 
+ assert len(remote_part_ids) == len({part_id.rank for part_id in remote_part_ids}) + + actx = get_container_context_recursively_opt(ary) + + if actx is None: + # NOTE: Assumes that the same number is passed on every rank + return [ + TracePair( + volume_dd.trace(BTAG_PARTITION(remote_part_id)), + interior=ary, exterior=ary) + for remote_part_id in remote_part_ids] from grudge.array_context import MPIPytatoArrayContextBase if isinstance(actx, MPIPytatoArrayContextBase): - rbc = _RankBoundaryCommunicationLazy + rbc_class = _RankBoundaryCommunicationLazy else: - rbc = _RankBoundaryCommunication - if comm_tag is not None: - num_tag: Optional[int] = None - if isinstance(comm_tag, int): - num_tag = comm_tag - - if num_tag is None: - # FIXME: This isn't guaranteed to be correct. - # See here for discussion: - # - https://github.com/illinois-ceesd/mirgecom/issues/617#issuecomment-1057082716 # noqa - # - https://github.com/inducer/grudge/pull/222 - from mpi4py import MPI - tag_ub = MPI.COMM_WORLD.Get_attr(MPI.TAG_UB) - key_builder = _TagKeyBuilder() - digest = key_builder(comm_tag) - num_tag = sum(ord(ch) << i for i, ch in enumerate(digest)) % tag_ub - - from warnings import warn - warn("Encountered unknown symbolic tag " - f"'{comm_tag}', assigning a value of '{num_tag}'. 
" - "This is a temporary workaround, please ensure that " - "tags are sufficiently distinct for your use case.") - - comm_tag = num_tag - - # Initialize and post all sends/receives - rank_bdry_communcators = [ - rbc(dcoll, ary, remote_rank, comm_tag=comm_tag) - for remote_rank in connected_ranks(dcoll) - ] - - # Complete send/receives and return communicated data - return [rc.finish() for rc in rank_bdry_communcators] + rbc_class = _RankBoundaryCommunicationEager + + rank_bdry_communicators = [] + + for remote_part_id in remote_part_ids: + bdry_dd = volume_dd.trace(BTAG_PARTITION(remote_part_id)) + + local_bdry_data = project(dcoll, volume_dd, bdry_dd, ary) + + from arraycontext import tag_axes + from meshmode.transform_metadata import ( + DiscretizationElementAxisTag, + DiscretizationDOFAxisTag) + remote_bdry_zeros = tag_axes( + actx, { + 0: DiscretizationElementAxisTag(), + 1: DiscretizationDOFAxisTag()}, + dcoll._inter_part_connections[ + remote_part_id, local_part_id].from_discr.zeros(actx)) + + remote_bdry_data_template = _replace_dof_arrays( + local_bdry_data, remote_bdry_zeros) + + rank_bdry_communicators.append( + rbc_class(actx, dcoll, + local_part_id=local_part_id, + remote_part_id=remote_part_id, + local_bdry_data=local_bdry_data, + remote_bdry_data_template=remote_bdry_data_template, + comm_tag=comm_tag)) + + return [rbc.finish() for rbc in rank_bdry_communicators] + +# }}} + + +# {{{ cross_rank_inter_volume_trace_pairs + +def cross_rank_inter_volume_trace_pairs( + dcoll: DiscretizationCollection, + pairwise_volume_data: Mapping[ + Tuple[DOFDesc, DOFDesc], + Tuple[ArrayOrContainer, ArrayOrContainer]], + *, comm_tag: Hashable = None, + ) -> Mapping[ + Tuple[DOFDesc, DOFDesc], + List[TracePair]]: + # FIXME: Should this interface take in boundary data instead? + # TODO: Docs + r"""Get a :class:`list` of *ary* trace pairs for each partition boundary. + + :arg comm_tag: a hashable object used to match sent and received data + across ranks. 
Communication will only match if both endpoints specify + objects that compare equal. A generalization of MPI communication + tags to arbitrary, potentially composite objects. + + :returns: a :class:`list` of :class:`TracePair` objects. + """ + # {{{ process arguments + + for vol_dd_pair in pairwise_volume_data.keys(): + for vol_dd in vol_dd_pair: + if not isinstance(vol_dd.domain_tag, VolumeDomainTag): + raise ValueError( + "pairwise_volume_data keys must describe volumes, " + f"got '{vol_dd}'") + if vol_dd.discretization_tag != DISCR_TAG_BASE: + raise ValueError( + "expected base-discretized DOFDesc in pairwise_volume_data, " + f"got '{vol_dd}'") + + # }}} + + if dcoll.mpi_communicator is None: + return [] + + rank = dcoll.mpi_communicator.Get_rank() + + for vol_data_pair in pairwise_volume_data.values(): + for vol_data in vol_data_pair: + actx = get_container_context_recursively_opt(vol_data) + if actx is not None: + break + if actx is not None: + break + + def get_remote_connected_parts(local_vol_dd, remote_vol_dd): + connected_part_ids = _connected_parts( + dcoll, self_volume_tag=local_vol_dd.domain_tag.tag, + other_volume_tag=remote_vol_dd.domain_tag.tag) + return [ + part_id + for part_id in connected_part_ids + if part_id.rank != rank] + + if actx is None: + # NOTE: Assumes that the same number is passed on every rank for a + # given volume + return { + (remote_vol_dd, local_vol_dd): [ + TracePair( + local_vol_dd.trace(BTAG_PARTITION(remote_part_id)), + interior=local_vol_ary, exterior=remote_vol_ary) + for remote_part_id in get_remote_connected_parts( + local_vol_dd, remote_vol_dd)] + for (remote_vol_dd, local_vol_dd), (remote_vol_ary, local_vol_ary) + in pairwise_volume_data.items()} + + from grudge.array_context import MPIPytatoArrayContextBase + + if isinstance(actx, MPIPytatoArrayContextBase): + rbc_class = _RankBoundaryCommunicationLazy + else: + rbc_class = _RankBoundaryCommunicationEager + + rank_bdry_communicators = {} + + for vol_dd_pair, 
vol_data_pair in pairwise_volume_data.items(): + directional_volume_data = { + (vol_dd_pair[0], vol_dd_pair[1]): (vol_data_pair[0], vol_data_pair[1]), + (vol_dd_pair[1], vol_dd_pair[0]): (vol_data_pair[1], vol_data_pair[0])} + + for dd_pair, data_pair in directional_volume_data.items(): + other_vol_dd, self_vol_dd = dd_pair + other_vol_data, self_vol_data = data_pair + + self_part_id = PartID(self_vol_dd.domain_tag.tag, rank) + other_part_ids = get_remote_connected_parts(self_vol_dd, other_vol_dd) + + rbcs = [] + + for other_part_id in other_part_ids: + self_bdry_dd = self_vol_dd.trace(BTAG_PARTITION(other_part_id)) + self_bdry_data = project( + dcoll, self_vol_dd, self_bdry_dd, self_vol_data) + + from arraycontext import tag_axes + from meshmode.transform_metadata import ( + DiscretizationElementAxisTag, + DiscretizationDOFAxisTag) + other_bdry_zeros = tag_axes( + actx, { + 0: DiscretizationElementAxisTag(), + 1: DiscretizationDOFAxisTag()}, + dcoll._inter_part_connections[ + other_part_id, self_part_id].from_discr.zeros(actx)) + + other_bdry_data_template = _replace_dof_arrays( + other_vol_data, other_bdry_zeros) + + rbcs.append( + rbc_class(actx, dcoll, + local_part_id=self_part_id, + remote_part_id=other_part_id, + local_bdry_data=self_bdry_data, + remote_bdry_data_template=other_bdry_data_template, + comm_tag=comm_tag)) + + rank_bdry_communicators[other_vol_dd, self_vol_dd] = rbcs + + return { + directional_vol_dd_pair: [rbc.finish() for rbc in rbcs] + for directional_vol_dd_pair, rbcs in rank_bdry_communicators.items()} # }}} diff --git a/prepare-and-run-flake8.sh b/prepare-and-run-flake8.sh new file mode 100755 index 000000000..d22a4d874 --- /dev/null +++ b/prepare-and-run-flake8.sh @@ -0,0 +1,9 @@ +#! 
/bin/bash + +curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh +source ci-support.sh + +print_status_message +clean_up_repo_and_working_env +create_and_set_up_virtualenv +install_and_run_flake8 "$@" diff --git a/requirements.txt b/requirements.txt index 2107e5aeb..5e21bd1e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,18 @@ numpy mpi4py +gmsh +hjson git+https://github.com/inducer/pytools.git#egg=pytools git+https://github.com/inducer/pymbolic.git#egg=pymbolic git+https://github.com/inducer/islpy.git#egg=islpy git+https://github.com/inducer/pyopencl.git#egg=pyopencl -git+https://github.com/inducer/loopy.git#egg=loopy +git+https://github.com/inducer/loopy.git@more-0-strides-fixing#egg=loopy git+https://github.com/inducer/dagrt.git#egg=dagrt git+https://github.com/inducer/leap.git#egg=leap git+https://github.com/inducer/meshpy.git#egg=meshpy git+https://github.com/inducer/modepy.git#egg=modepy +git+https://github.com/nchristensen/meshmode.git@dof_tagging#egg=meshmode git+https://github.com/inducer/arraycontext.git#egg=arraycontext -git+https://github.com/inducer/meshmode.git#egg=meshmode git+https://github.com/inducer/pyvisfile.git#egg=pyvisfile git+https://github.com/inducer/pymetis.git#egg=pymetis git+https://github.com/illinois-ceesd/logpyle.git#egg=logpyle diff --git a/setup.cfg b/setup.cfg index da67c8630..9b8c87fc4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,6 +12,8 @@ per-file-ignores = test/test_op.py:B023 test/test_euler_model.py:B023 +#per-file-ignores = +# grudge/loopy_dg_kernels/run_tests.py:N806, N803, N802 # enable-flake8-bugbear [mypy] diff --git a/setup.py b/setup.py index adcbd6c02..aeb776597 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ def main(): version_dict = {} init_filename = "grudge/version.py" + exec(compile(open(init_filename, "r").read(), init_filename, "exec"), version_dict) @@ -47,9 +48,12 @@ def main(): "meshmode>=2020.2", "pyopencl>=2013.1", "pymbolic>=2013.2", - 
"loopy>=2020.2", + "loopy>=2020.2.2", "cgen>=2013.1.2", - "dataclasses>=0.7;python_version<='3.6'" + "hjson", + #"gmsh", + "import_resources; python_version<'3.7'", + "dataclasses>=0.7; python_version<='3.6'" ], ) diff --git a/test/test_grudge.py b/test/test_grudge.py index ce0b199b8..9b119d9d7 100644 --- a/test/test_grudge.py +++ b/test/test_grudge.py @@ -38,19 +38,64 @@ from pytools.obj_array import flat_obj_array -from grudge import DiscretizationCollection +from grudge import DiscretizationCollection, make_discretization_collection import grudge.dof_desc as dof_desc import grudge.op as op import pytest +from meshmode.array_context import generate_pytest_generate_tests +from grudge.grudge_array_context import GrudgeArrayContext +pytest_generate_tests = generate_pytest_generate_tests(GrudgeArrayContext) import logging logger = logging.getLogger(__name__) +# { {{ inverse metric + +@pytest.mark.parametrize("dim", [2, 3]) +def test_inverse_metric(actx_factory, dim): + actx = actx_factory() + + mesh = mgen.generate_regular_rect_mesh(a=(-0.5,)*dim, b=(0.5,)*dim, + nelements_per_axis=(6,)*dim, order=4) + + def m(x): + result = np.empty_like(x) + result[0] = ( + 1.5*x[0] + np.cos(x[0]) + + 0.1*np.sin(10*x[1])) + result[1] = ( + 0.05*np.cos(10*x[0]) + + 1.3*x[1] + np.sin(x[1])) + if len(x) == 3: + result[2] = x[2] + return result + + from meshmode.mesh.processing import map_mesh + mesh = map_mesh(mesh, m) + + dcoll = DiscretizationCollection(actx, mesh, order=4) + + from grudge.geometry import \ + forward_metric_derivative_mat, inverse_metric_derivative_mat + + mat = forward_metric_derivative_mat(actx, dcoll).dot( + inverse_metric_derivative_mat(actx, dcoll)) + + for i in range(mesh.dim): + for j in range(mesh.dim): + tgt = 1 if i == j else 0 + + err = flat_norm(mat[i, j] - tgt, ord=np.inf) + logger.info("error[%d, %d]: %.5e", i, j, err) + assert err < 1.0e-12, (i, j, err) + +# }}} + # {{{ mass operator trig integration @pytest.mark.parametrize("ambient_dim", [1, 2, 3]) 
@@ -341,7 +386,10 @@ def test_face_normal_surface(actx_factory, mesh_name): surf_normal = surf_normal / actx.np.sqrt(sum(surf_normal**2)) face_normal_i = actx.thaw(dcoll.normal(df)) - face_normal_e = dcoll.opposite_face_connection()(face_normal_i) + face_normal_e = dcoll.opposite_face_connection( + dof_desc.BoundaryDomainTag( + dof_desc.FACE_RESTR_INTERIOR, dof_desc.VTAG_ALL) + )(face_normal_i) if mesh.ambient_dim == 3: from grudge.geometry import pseudoscalar, area_element @@ -618,10 +666,9 @@ def f(x): or eoc_local.order_estimate() > order - 0.5 # }}} - - # {{{ models: advection + @pytest.mark.parametrize(("mesh_name", "mesh_pars"), [ ("segment", [8, 16, 32]), ("disk", [0.07, 0.02, 0.01]), @@ -780,9 +827,9 @@ def rhs(t, u): # }}} - # {{{ models: maxwell + @pytest.mark.parametrize("order", [3, 4, 5]) def test_convergence_maxwell(actx_factory, order): """Test whether 3D Maxwell's actually converges""" @@ -857,9 +904,9 @@ def rhs(t, w): # }}} - # {{{ models: variable coefficient advection oversampling + @pytest.mark.parametrize("order", [2, 3, 4]) def test_improvement_quadrature(actx_factory, order): """Test whether quadrature improves things and converges""" @@ -941,6 +988,34 @@ def zero_inflow(dtag, t=0): # }}} +# {{{ operator collector determinism + + +def test_op_collector_order_determinism(): + class TestOperator(sym.Operator): + + def __init__(self): + sym.Operator.__init__(self, dof_desc.DD_VOLUME, dof_desc.DD_VOLUME) + + mapper_method = "map_test_operator" + + from grudge.symbolic.mappers import BoundOperatorCollector + + class TestBoundOperatorCollector(BoundOperatorCollector): + + def map_test_operator(self, expr): + return self.map_operator(expr) + + v0 = sym.var("v0") + ob0 = sym.OperatorBinding(TestOperator(), v0) + + v1 = sym.var("v1") + ob1 = sym.OperatorBinding(TestOperator(), v1) + + # The output order isn't significant, but it should always be the same. 
+ assert list(TestBoundOperatorCollector(TestOperator)(ob0 + ob1)) == [ob0, ob1] + +# }}} # {{{ bessel @@ -978,6 +1053,47 @@ def bessel_j(actx, n, r): # }}} +# {{{ function symbol + + +def test_external_call(actx_factory): + actx = actx_factory() + + def double(queue, x): + return 2 * x + + dims = 2 + + mesh = mgen.generate_regular_rect_mesh( + a=(0,) * dims, b=(1,) * dims, nelements_per_axis=(4,) * dims) + discr = DiscretizationCollection(actx, mesh, order=1) + + ones = sym.Ones(dof_desc.DD_VOLUME) + op = ( + ones * 3 + + sym.FunctionSymbol("double")(ones)) + + from grudge.function_registry import ( + base_function_registry, register_external_function) + + freg = register_external_function( + base_function_registry, + "double", + implementation=double, + dd=dof_desc.DD_VOLUME) + + bound_op = bind(discr, op, function_registry=freg) + + result = bound_op(actx, double=double) + assert actx.to_numpy(flatten(result) == 5).all() + + +@pytest.mark.parametrize("array_type", ["scalar", "vector"]) +def test_function_symbol_array(actx_factory, array_type): + """Test if `FunctionSymbol` distributed properly over object arrays.""" + + +# {{{ test norms @pytest.mark.parametrize("p", [2, np.inf]) def test_norm_real(actx_factory, p): @@ -1042,6 +1158,10 @@ def test_norm_obj_array(actx_factory, p): logger.info("norm: %.5e %.5e", norm, ref_norm) assert abs(norm-ref_norm) / abs(ref_norm) < 1e-14 +# }}} + + +# {{{ empty boundaries def test_empty_boundary(actx_factory): # https://github.com/inducer/grudge/issues/54 @@ -1061,10 +1181,39 @@ def test_empty_boundary(actx_factory): assert isinstance(component, DOFArray) assert len(component) == len(dcoll.discr_from_dd(BTAG_NONE).groups) +# }}} + + +# {{{ multi-volume + +def test_multi_volume(actx_factory): + dim = 2 + actx = actx_factory() + + mesh = mgen.generate_regular_rect_mesh( + a=(-0.5,)*dim, b=(0.5,)*dim, + nelements_per_axis=(8,)*dim, order=4) + + meg, = mesh.groups + x = mesh.vertices[0, meg.vertex_indices] + x_elem_avg = 
np.sum(x, axis=1)/x.shape[1] + volume_per_element = (x_elem_avg > 0).astype(np.int32) + + from meshmode.distributed import membership_list_to_map + volume_to_elements = membership_list_to_map(volume_per_element) + + from meshmode.mesh.processing import partition_mesh + volume_to_mesh = partition_mesh(mesh, volume_to_elements) + + make_discretization_collection(actx, volume_to_mesh, order=4) + +# }}} + # You can test individual routines by typing # $ python test_grudge.py 'test_routine()' + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_mpi_communication.py b/test/test_mpi_communication.py index 47100b415..6d1d26f40 100644 --- a/test/test_mpi_communication.py +++ b/test/test_mpi_communication.py @@ -31,6 +31,7 @@ import logging import sys +from grudge.grudge_array_context import GrudgeArrayContext from grudge.array_context import MPIPyOpenCLArrayContext, MPIPytatoArrayContext logger = logging.getLogger(__name__) @@ -45,6 +46,7 @@ from pytools.obj_array import flat_obj_array import grudge.op as op +import grudge.dof_desc as dof_desc class SimpleTag: @@ -153,7 +155,10 @@ def hopefully_zero(): return ( op.project( dcoll, "int_faces", "all_faces", - dcoll.opposite_face_connection()(int_faces_func) + dcoll.opposite_face_connection( + dof_desc.BoundaryDomainTag( + dof_desc.FACE_RESTR_INTERIOR, dof_desc.VTAG_ALL) + )(int_faces_func) ) + sum(op.project(dcoll, tpair.dd, "all_faces", tpair.ext) for tpair in op.cross_rank_trace_pairs(dcoll, myfunc, @@ -170,7 +175,6 @@ def hopefully_zero(): assert error < 1e-14 - # }}} diff --git a/test/test_reductions.py b/test/test_reductions.py index 9e7387bad..e0e424fe4 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -149,7 +149,7 @@ def f(x): min_res = np.empty(grp_f.shape) max_res = np.empty(grp_f.shape) sum_res = np.empty(grp_f.shape) - for eidx in range(dcoll.mesh.nelements): + for eidx in range(mesh.nelements): element_data = actx.to_numpy(grp_f[eidx]) min_res[eidx, :] = 
np.min(element_data) max_res[eidx, :] = np.max(element_data) @@ -272,7 +272,7 @@ def _get_ref_data(field): min_res = np.empty(grp_f.shape) max_res = np.empty(grp_f.shape) sum_res = np.empty(grp_f.shape) - for eidx in range(dcoll.mesh.nelements): + for eidx in range(mesh.nelements): element_data = actx.to_numpy(grp_f[eidx]) min_res[eidx, :] = np.min(element_data) max_res[eidx, :] = np.max(element_data)