diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index ce90fbdf2e..c9fb51ea0e 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -53,17 +53,40 @@ def autotune(operator, arguments, tunable): # Attempted block sizes ... mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable]) + time_dim = None + for i, d in mapper.items(): + if d.original_dim.is_Time: + time_dim = i + # ... Defaults (basic mode) - blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']] + blocksizes = [OrderedDict([(i, v) for i in mapper if not mapper[i].original_dim.is_Time]) for v in options['at_blocksize']] # cubes # ... Always try the entire iteration space (degenerate block) datashape = [at_arguments[mapper[i].original_dim.symbolic_end.name] - at_arguments[mapper[i].original_dim.symbolic_start.name] for i in mapper] blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j)) - for i, j in zip(mapper, datashape)])) + for i, j in zip(mapper, datashape)])) # degenerate block # ... More attempts if auto-tuning in aggressive mode if configuration.core['autotuning'] == 'aggressive': + last_dim = None + innermost = iterations[-1].dim + for k, v in mapper.items(): + if v.original_dim == innermost: + last_dim = (k, blocksizes[-1][k]) + blocksizes = more_heuristic_attempts(blocksizes) + if last_dim: + info_at("Extending the innermost dimension, %s <%s>" % (last_dim[0], last_dim[1])) + intermediate_blocks = [OrderedDict([(i, v) for i in mapper if not (mapper[i].original_dim.is_Time or mapper[i].original_dim == innermost)]) + for v in options['at_blocksize']] + intermediate_blocks = more_heuristic_attempts(intermediate_blocks) + blocksizes += cross_time_tiles(intermediate_blocks, last_dim[0], [last_dim[1]]) + # TODO: don't extend this: run generator for 2 dims, then extend that + + if time_dim: + blocksizes = cross_time_tiles(blocksizes, time_dim, [1, 2, 4, 8, 16]) + + # How many temporaries are allocated on the stack? # Will drop block sizes that might lead to a stack overflow functions = FindSymbols('symbolics').visit(operator.body + @@ -74,7 +97,14 @@ def autotune(operator, arguments, tunable): # Note: there is only a single loop over 'blocksize' because only # square blocks are tested timings = OrderedDict() + fastest, timing = None, float("inf") + unique = [] + for bs in blocksizes: + if bs in unique: + continue + unique.append(bs) + illegal = False for k, v in at_arguments.items(): if k in bs: @@ -115,12 +145,16 @@ def autotune(operator, arguments, tunable): operator.cfunction(*list(at_arguments.values())) elapsed = sum(operator.profiler.timings.values()) timings[tuple(bs.items())] = elapsed + if elapsed < timing: + fastest = tuple(bs.items()) + timing = elapsed info_at("Block shape <%s> took %f (s) in %d time steps" % (','.join('%d' % i for i in bs.values()), elapsed, timesteps)) try: - best = dict(min(timings, key=timings.get)) - info("Auto-tuned block shape: %s" % best) + # best = dict(min(timings, key=timings.get)) + best = dict(fastest) + info("Auto-tuned block shape: %s; time: %f (s)" % (best, timing)) except ValueError: info("Auto-tuning request, but couldn't find legal block sizes") return arguments @@ -140,6 +174,7 @@ def autotune(operator, arguments, tunable): def more_heuristic_attempts(blocksizes): # Ramp up to higher block sizes handle = OrderedDict([(i, options['at_blocksize'][-1]) for i in blocksizes[0]]) + # insert more cubes for i in range(3): new_bs = OrderedDict([(k, v*2) for k, v in handle.items()]) blocksizes.insert(blocksizes.index(handle) + 1, new_bs) @@ -152,22 +187,30 @@ def more_heuristic_attempts(blocksizes): handle.append(OrderedDict(list(bs.items())[:-1] + [list(i.items())[-1]])) # Some more shuffling for all block sizes for bs in list(blocksizes): - ncombs = len(bs) + ncombs = len(bs) # dimensions to tile over for i in range(ncombs): for j in combinations(bs, i+1): item = [(k, bs[k]*2 if k in j else v) for k, v in bs.items()] handle.append(OrderedDict(item)) - unique = [] - for i in blocksizes + handle: - if i not in unique: - unique.append(i) + return blocksizes + handle + + +def extend_dimension(blocksizes, dim, size): + return blocksizes + [OrderedDict([(dim, size) if dim == d else (d, s) for d, s in bs.items()]) for bs in blocksizes] + + +def cross_time_tiles(blocksizes, dim, tiles): + extended = [] + for bs in blocksizes: + for tile in tiles: + extended.append(OrderedDict([(dim, tile)] + list(bs.items()))) - return unique + return extended options = { - 'at_squeezer': 5, + 'at_squeezer': 17, 'at_blocksize': sorted({8, 16, 24, 32, 40, 64, 128}), 'at_stack_limit': resource.getrlimit(resource.RLIMIT_STACK)[0] / 4 } diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 28c7319c74..567a02ba74 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -3,25 +3,24 @@ from __future__ import absolute_import from collections import OrderedDict -from itertools import combinations import cgen import numpy as np import psutil from devito.cgen_utils import ccode -from devito.dimension import Dimension from devito.dle import fold_blockable_tree, unfold_blocked_tree from devito.dle.backends import (BasicRewriter, BlockingArg, dle_pass, omplang, simdinfo, get_simd_flag, get_simd_items) from devito.dse import promote_scalar_expressions from devito.exceptions import DLEException -from devito.ir.iet import (Block, Expression, Iteration, List, - PARALLEL, ELEMENTAL, REMAINDER, tagger, +from devito.ir.iet import (Block, Expression, Iteration, List, ELEMENTAL, FindNodes, FindSymbols, IsPerfectIteration, SubstituteExpression, Transformer, compose_nodes, - retrieve_iteration_tree, filter_iterations, copy_arrays) + retrieve_iteration_tree, filter_iterations, + copy_arrays, BlockIterations) from devito.logger import dle_warning +from devito.parameters import configuration from devito.tools import as_tuple, grouper, roundm from devito.types import Array @@ -35,7 +34,7 @@ def _pipeline(self, state): self._simdize(state) if self.params['openmp'] is True: self._ompize(state) - self._create_elemental_functions(state) + # self._create_elemental_functions(state) self._minimize_remainders(state) @dle_pass @@ -129,7 +128,12 @@ def _loop_blocking(self, nodes, state): blocked = OrderedDict() for tree in retrieve_iteration_tree(fold): # Is the Iteration tree blockable ? - iterations = [i for i in tree if i.is_Parallel] + # FIXME: change mark_parallel ensure skewed loops are is_Parallel + if configuration['skew_factor']: + iterations = tree + else: + iterations = [i for i in tree if i.is_Parallel] + if exclude_innermost: iterations = [i for i in iterations if not i.is_Vectorizable] if len(iterations) <= 1: @@ -145,66 +149,16 @@ def _loop_blocking(self, nodes, state): # sequential loop (e.g., a timestepping loop) continue - # Decorate intra-block iterations with an IterationProperty - TAG = tagger(len(mapper)) - - # Build all necessary Iteration objects, individually. These will - # subsequently be composed to implement loop blocking. - inter_blocks = [] - intra_blocks = [] - remainders = [] - for i in iterations: - name = "%s%d_block" % (i.dim.name, len(mapper)) - - # Build Iteration over blocks - dim = blocked.setdefault(i, Dimension(name)) - block_size = dim.symbolic_size - iter_size = i.dim.symbolic_extent - start = i.limits[0] - i.offsets[0] - finish = i.dim.symbolic_end - i.offsets[1] - innersize = iter_size - (-i.offsets[0] + i.offsets[1]) - finish = finish - (innersize % block_size) - inter_block = Iteration([], dim, [start, finish, block_size], - properties=PARALLEL) - inter_blocks.append(inter_block) - - # Build Iteration within a block - start = inter_block.dim - finish = start + block_size - intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=i.properties + (TAG, ELEMENTAL)) - intra_blocks.append(intra_block) - - # Build unitary-increment Iteration over the 'leftover' region. - # This will be used for remainder loops, executed when any - # dimension size is not a multiple of the block size. - start = inter_block.limits[1] - finish = i.dim.symbolic_end - i.offsets[1] - remainder = i._rebuild([], limits=[start, finish, 1], offsets=None) - remainders.append(remainder) - - # Build blocked Iteration nest - blocked_tree = compose_nodes(inter_blocks + intra_blocks + - [iterations[-1].nodes]) - - # Build remainder Iterations - remainder_trees = [] - for n in range(len(iterations)): - for c in combinations([i.dim for i in iterations], n + 1): - # First all inter-block Interations - nodes = [b._rebuild(properties=b.properties + (REMAINDER,)) - for b, r in zip(inter_blocks, remainders) - if r.dim not in c] - # Then intra-block or remainder, for each dim (in order) - properties = (REMAINDER, TAG, ELEMENTAL) - for b, r in zip(intra_blocks, remainders): - handle = r if b.dim in c else b - nodes.append(handle._rebuild(properties=properties)) - nodes.extend([iterations[-1].nodes]) - remainder_trees.append(compose_nodes(nodes)) + condition = lambda i: (i in iterations) + tag = len(mapper) + blocker = BlockIterations(tag, blocked, condition=condition) + intra_blocks = blocker.visit(root) + inter_blocks = blocker.inter_blocks + blocked = blocker.blocked + blocked_tree = compose_nodes(inter_blocks + [intra_blocks]) # Will replace with blocked loop tree - mapper[root] = List(body=[blocked_tree] + remainder_trees) + mapper[root] = List(body=[blocked_tree]) rebuilt = Transformer(mapper).visit(fold) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index b0524a982c..7e3051b309 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -202,9 +202,10 @@ class IterationFold(Iteration): is_IterationFold = True def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None, pragmas=None, uindices=None, folds=None): + properties=None, pragmas=None, uindices=None, folds=None, + skew=None): super(IterationFold, self).__init__(nodes, dimension, limits, index, offsets, - properties, uindices, pragmas) + properties, uindices, pragmas, skew=skew) self.folds = folds def __repr__(self): diff --git a/devito/dse/backends/__init__.py b/devito/dse/backends/__init__.py index 233a2b1c0a..a1fae4f100 100644 --- a/devito/dse/backends/__init__.py +++ b/devito/dse/backends/__init__.py @@ -1,6 +1,7 @@ from devito.dse.backends.common import * # noqa from devito.dse.backends.basic import BasicRewriter # noqa -from devito.dse.backends.advanced import AdvancedRewriter # noqa +from devito.dse.backends.advanced import (AdvancedRewriter, # noqa + SkewingRewriter) from devito.dse.backends.speculative import (SpeculativeRewriter, # noqa AggressiveRewriter, CustomRewriter) diff --git a/devito/dse/backends/advanced.py b/devito/dse/backends/advanced.py index 0ad2a2a7f6..f03e70bd65 100644 --- a/devito/dse/backends/advanced.py +++ b/devito/dse/backends/advanced.py @@ -2,10 +2,12 @@ from collections import OrderedDict +from devito.dimension import TimeDimension from devito.ir import clusterize from devito.dse.aliases import collect from devito.dse.backends import BasicRewriter, dse_pass -from devito.symbolics import Eq, estimate_cost, xreplace_constrained, iq_timeinvariant +from devito.parameters import configuration +from devito.symbolics import Eq, estimate_cost, xreplace_constrained, iq_timeinvariant, xreplace_indices from devito.dse.manipulation import (common_subexprs_elimination, collect_nested, compact_temporaries) from devito.types import Indexed, Scalar, Array @@ -162,3 +164,37 @@ def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs): processed = [e.xreplace(rules) for e in processed] return alias_clusters + [cluster.rebuild(processed)] + + +class SkewingRewriter(AdvancedRewriter): + + def _pipeline(self, state): + self._loop_skew(state) + self._extract_time_invariants(state) + self._eliminate_inter_stencil_redundancies(state) + self._eliminate_intra_stencil_redundancies(state) + self._factorize(state) + + @dse_pass + def _loop_skew(self, cluster, template, **kwargs): + skew_factor = -configuration['skew_factor'] + t, mapper = None, {} + skews = {} + + # FIXME: this is probably the wrong way to find the time dimension + for dim in cluster.stencil.dimensions: + if t is not None: + mapper[dim] = dim + skew_factor * t + skews[dim] = (skew_factor, t) + elif dim.is_Time: + if isinstance(dim, TimeDimension): + t = dim + elif isinstance(dim.parent, TimeDimension): + t = dim.parent + + if t is None: + return cluster + + cluster.skewed_loops = skews + processed = xreplace_indices(cluster.exprs, mapper) + return cluster.rebuild(processed) diff --git a/devito/dse/transformer.py b/devito/dse/transformer.py index de315b9332..633c832c7a 100644 --- a/devito/dse/transformer.py +++ b/devito/dse/transformer.py @@ -2,7 +2,7 @@ from devito.ir.clusters import ClusterGroup, groupby from devito.dse.backends import (BasicRewriter, AdvancedRewriter, SpeculativeRewriter, - AggressiveRewriter, CustomRewriter) + AggressiveRewriter, CustomRewriter, SkewingRewriter) from devito.exceptions import DSEException from devito.logger import dse_warning from devito.parameters import configuration @@ -13,12 +13,17 @@ modes = { 'basic': BasicRewriter, 'advanced': AdvancedRewriter, + 'skewing': SkewingRewriter, 'speculative': SpeculativeRewriter, 'aggressive': AggressiveRewriter } """The DSE transformation modes.""" +# FIXME: unsure what this should be +MAX_SKEW_FACTOR = 8 + configuration.add('dse', 'advanced', list(modes)) +configuration.add('skew_factor', 0, range(MAX_SKEW_FACTOR)) def rewrite(clusters, mode='advanced'): diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index d782fb9a65..a77953afb8 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -259,7 +259,7 @@ def clusterize(exprs, stencils): clusters = ClusterGroup() for target, pc in mapper.items(): exprs = [i for i in pc.exprs if i.lhs.is_Symbol or i.lhs == target] - clusters.append(PartialCluster(exprs, pc.stencil)) + clusters.append(PartialCluster(exprs, pc.stencil, pc.skewed_loops)) # Attempt grouping as many PartialClusters as possible together return groupby(clusters) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 41ddb95512..2cf173ee98 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -18,7 +18,7 @@ class PartialCluster(object): the embedded sequence of expressions are subjected to modifications. """ - def __init__(self, exprs, stencil): + def __init__(self, exprs, stencil, skewed_loops={}): """ Initialize a PartialCluster. @@ -29,6 +29,7 @@ def __init__(self, exprs, stencil): """ self._exprs = list(exprs) self._stencil = stencil + self.skewed_loops = skewed_loops @property def exprs(self): @@ -70,9 +71,10 @@ class Cluster(PartialCluster): """A Cluster is an immutable PartialCluster.""" - def __init__(self, exprs, stencil): + def __init__(self, exprs, stencil, skewed_loops={}): self._exprs = as_tuple(exprs) self._stencil = stencil.frozen + self.skewed_loops = skewed_loops @cached_property def trace(self): @@ -90,7 +92,7 @@ def rebuild(self, exprs): """ Build a new cluster with expressions ``exprs`` having same stencil as ``self``. """ - return Cluster(exprs, self.stencil) + return Cluster(exprs, self.stencil, self.skewed_loops) @PartialCluster.exprs.setter def exprs(self, val): @@ -120,7 +122,7 @@ def unfreeze(self): Return a new ClusterGroup in which all of ``self``'s Clusters have been promoted to PartialClusters. The ``atomics`` information is lost. """ - return ClusterGroup([PartialCluster(i.exprs, i.stencil) + return ClusterGroup([PartialCluster(i.exprs, i.stencil, i.skewed_loops) if isinstance(i, Cluster) else i for i in self]) def freeze(self): @@ -131,7 +133,7 @@ def freeze(self): clusters = ClusterGroup() for i in self: if isinstance(i, PartialCluster): - cluster = Cluster(i.exprs, i.stencil) + cluster = Cluster(i.exprs, i.stencil, i.skewed_loops) clusters.append(cluster) clusters.atomics[cluster] = self.atomics[i] else: diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 7d4f722371..c90f874fb4 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -256,7 +256,7 @@ class Iteration(Node): _traversable = ['nodes'] def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None, pragmas=None, uindices=None): + properties=None, pragmas=None, uindices=None, skew=None): # Ensure we deal with a list of Expression objects internally nodes = as_tuple(nodes) self.nodes = as_tuple([n if isinstance(n, Node) else Expression(n) @@ -290,6 +290,9 @@ def __init__(self, nodes, dimension, limits, index=None, offsets=None, self.uindices = as_tuple(uindices) assert all(isinstance(i, UnboundedIndex) for i in self.uindices) + # If there is no skewing factor, want to insert a dummy dimension + self.skew = skew if skew else (0, self.dim) + def __repr__(self): properties = "" if self.properties: diff --git a/devito/ir/iet/utils.py b/devito/ir/iet/utils.py index 5b8d7d47f8..3ac5a96906 100644 --- a/devito/ir/iet/utils.py +++ b/devito/ir/iet/utils.py @@ -112,7 +112,15 @@ def compose_nodes(nodes, retrieve=False): body = l.pop(-1) while l: handle = l.pop(-1) - body = handle._rebuild(body, **handle.args_frozen) + # The original code assumed nested loops only, and we wanted + # to handle [Iteration, Expression, Iteration, ...] + # See DevitoRewriter._loop_blocking() + # Easy to abuse (think of perfect loop nests) + # FIXME: can only handle one expression before an iteration + if isinstance(handle, Expression) and isinstance(body, Iteration): + body = (handle, body) + else: + body = handle._rebuild(body, **handle.args_frozen) tree.append(body) if retrieve is True: diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 04c45ab382..96a1021294 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -12,11 +12,14 @@ import cgen as c import numpy as np +from sympy import Max, Min, Eq from devito.cgen_utils import blankline, ccode -from devito.dimension import LoweredDimension +from devito.dimension import LoweredDimension, Dimension from devito.exceptions import VisitorException -from devito.ir.iet.nodes import Iteration, Node, UnboundedIndex +from devito.ir.iet import tagger, SEQUENTIAL, PARALLEL, ELEMENTAL +from devito.ir.iet.nodes import Iteration, Node, UnboundedIndex, Expression, \ + List from devito.types import Scalar from devito.tools import as_tuple, filter_ordered, filter_sorted, flatten, ctypes_to_C @@ -24,7 +27,8 @@ __all__ = ['FindNodes', 'FindSections', 'FindSymbols', 'MapExpressions', 'IsPerfectIteration', 'SubstituteExpression', 'printAST', 'CGen', 'ResolveTimeStepping', 'Transformer', 'NestedTransformer', - 'FindAdjacentIterations', 'MergeOuterIterations', 'MapIteration'] + 'FindAdjacentIterations', 'MergeOuterIterations', 'MapIteration', + 'BlockIterations'] class Visitor(object): @@ -806,5 +810,73 @@ def visit_list(self, o): visit_tuple = visit_list +class BlockIterations(Visitor): + """ + Tile an iteration tree, given a condition. + """ + + def __init__(self, tag, blocked, condition=lambda _: True): + super(BlockIterations, self).__init__() + self.TAG = tagger(tag) + self.tag = tag + self.condition = condition + self.inter_blocks = [] + self.blocked = blocked + + def visit_Block(self, o): + #rebuilt = [self.visit(i) for i in o.children] + rebuilt = self.visit(o.children) + return o._rebuild(*rebuilt, **o.args_frozen) + + def visit_Iteration(self, o): + if not self.condition(o): + return o._rebuild(*self.visit(o.children), **o.args_frozen) + + # Do the actual blocking + name = "%s%d_block" % (o.dim.name, self.tag) + dim = self.blocked.setdefault(o, Dimension(name)) + block_size = dim.symbolic_size + + # FIXME: what if the time dimension doesn't start at 0? + # We subtract the skew here to straighten out the blocks + dim_start = o.limits[0] - o.offsets[0] + dim_finish = o.limits[1] - o.offsets[1] + + skew = o.skew[0] * o.skew[1] + skew_max = o.skew[0] * o.skew[1].symbolic_end + + outer_start = dim_start + skew + outer_finish = dim_finish + skew - skew_max + inter_block = Iteration([], dim, [outer_start, outer_finish, block_size]) + self.inter_blocks.append(inter_block) + + lower_bound = Max(inter_block.dim, dim_start) + inner_start = Scalar(name="%s_lb" % o.dim.name) + lb_expr = Expression(Eq(inner_start, lower_bound), np.dtype(np.int32)) + + upper_bound = Min(inter_block.dim + block_size, dim_finish) + inner_finish = Scalar(name="%s_ub" % o.dim.name) + ub_expr = Expression(Eq(inner_finish, upper_bound), np.dtype(np.int32)) + + if o.is_Parallel: + properties = [p for p in o.properties if p != SEQUENTIAL] + [PARALLEL, self.TAG] + else: + properties = o.properties + (self.TAG, ELEMENTAL) + + rebuilt = self.visit(o.children) + i = o._rebuild(*rebuilt, limits=[inner_start, inner_finish, 1], + offsets=None, properties=properties) + return List(body=(lb_expr, ub_expr, i)) + + def visit_Expression(self, o): + return o + + def visit_list(self, o): + rebuilt = [self.visit(i) for i in o] + return rebuilt + + visit_tuple = visit_list + + def printAST(node, verbose=True): return PrintAST(verbose=verbose).visit(node) diff --git a/devito/operator.py b/devito/operator.py index f30a05ec72..b842592b89 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -15,6 +15,7 @@ from devito.dse import rewrite from devito.exceptions import InvalidArgument, InvalidOperator from devito.function import Forward, Backward, CompositeFunction +from devito.ir import Cluster from devito.logger import bar, error, info from devito.ir.clusters import clusterize from devito.ir.iet import (Element, Expression, Callable, Iteration, List, @@ -192,6 +193,7 @@ def arguments(self, **kwargs): dim_sizes.update(dle_arguments) autotune = autotune and kwargs.pop('autotune', False) + kwargs.pop('autotune', False) # Make sure we've used all arguments passed if len(kwargs) > 0: @@ -300,8 +302,8 @@ def _autotune(self, arguments): best block sizes when loop blocking is in use.""" return arguments - def _schedule_expressions(self, clusters): - """Create an Iteartion/Expression tree given an iterable of + def _schedule_expressions(self, clusters: Cluster): + """Create an Iteration/Expression tree given an iterable of :class:`Cluster` objects.""" # Build the Iteration/Expression tree @@ -326,8 +328,16 @@ def _schedule_expressions(self, clusters): needed = entries[index:] # Build and insert the required Iterations - iters = [Iteration([], j.dim, j.dim.limits, offsets=j.ofs) for j in - needed] + iters = [] + for j in needed: + limits = j.dim.limits + if j.dim in i.skewed_loops: + skew_tuple = i.skewed_loops[j.dim] + skew = skew_tuple[0] * skew_tuple[1] + limits = (limits[0] - skew, limits[1] - skew, limits[2]) + else: + skew_tuple = 0 + iters.append(Iteration([], j.dim, limits, offsets=j.ofs, skew=skew_tuple)) body, tree = compose_nodes(iters + [expressions], retrieve=True) scheduling = OrderedDict(zip(needed, tree)) if root is None: diff --git a/devito/parameters.py b/devito/parameters.py index 180d7e5022..49a646e1d9 100644 --- a/devito/parameters.py +++ b/devito/parameters.py @@ -92,6 +92,7 @@ def name(self): 'DEVITO_LOGGING': 'log_level', 'DEVITO_FIRST_TOUCH': 'first_touch', 'DEVITO_DEBUG_COMPILER': 'debug_compiler', + 'DEVITO_SKEW_FACTOR': 'skew_factor', } configuration = Parameters("Devito-Configuration") diff --git a/examples/seismic/tti/tti_example.py b/examples/seismic/tti/tti_example.py index 87de1fac7a..07840847e0 100644 --- a/examples/seismic/tti/tti_example.py +++ b/examples/seismic/tti/tti_example.py @@ -64,7 +64,7 @@ def run(shape=(50, 50, 50), spacing=(20.0, 20.0, 20.0), tn=250.0, choices=['centered', 'shifted'], help="Choice of finite-difference kernel") parser.add_argument("-dse", "-dse", default="advanced", - choices=["noop", "basic", "advanced", + choices=["noop", "basic", "advanced", "skewing", "speculative", "aggressive"], help="Devito symbolic engine (DSE) mode") parser.add_argument("-dle", default="advanced", diff --git a/tests/test_dle.py b/tests/test_dle.py index b256900d29..0dcf05360e 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -9,6 +9,7 @@ from conftest import EVAL +from devito import configuration from devito.dle import transform from devito.dle.backends import DevitoRewriter as Rewriter from devito import Grid, Function, TimeFunction, Eq, Operator @@ -141,7 +142,7 @@ def _new_operator3(shape, time_order, **kwargs): # Allocate the grid and set initial condition # Note: This should be made simpler through the use of defaults - u = TimeFunction(name='u', grid=grid, time_order=1, space_order=2) + u = TimeFunction(name='u', grid=grid, time_order=time_order, space_order=2) u.data[0, :] = np.arange(reduce(mul, shape), dtype=np.int32).reshape(shape) # Derive the stencil according to devito conventions @@ -377,6 +378,61 @@ def test_cache_blocking_edge_cases_highorder(shape, blockshape): assert np.equal(wo_blocking.data, w_blocking.data).all() +@skipif_yask +@pytest.mark.parametrize("shape,blockshape", [ + ((25, 25, 46), (None, None, None)), + ((25, 25, 46), (7, None, None)), + ((25, 25, 46), (None, None, 7)), + ((25, 25, 46), (None, 7, None)), + ((25, 25, 46), (5, None, 7)), + ((25, 25, 46), (10, 3, None)), + ((25, 25, 46), (None, 7, 11)), + ((25, 25, 46), (8, 2, 4)), + ((25, 25, 46), (2, 4, 8)), + ((25, 25, 46), (4, 8, 2)), + ((25, 46), (None, 7)), + ((25, 46), (7, None)) +]) +def test_time_blocking_edge_cases(shape, blockshape): + prev = configuration['skew_factor'] if 'skew_factor' in configuration else 0 + configuration['skew_factor'] = 2 + wo_blocking, _ = _new_operator2(shape, time_order=2, dle='noop') + w_blocking, _ = _new_operator2(shape, time_order=2, dse='skewing', + dle=('blocking,openmp', + {'blockshape': blockshape, + 'blockinner': True})) + assert np.equal(wo_blocking.data, w_blocking.data).all() + configuration['skew_factor'] = prev + + +@skipif_yask +@pytest.mark.parametrize("shape,blockshape", [ + ((3, 3), (3, 4)), + ((4, 4), (3, 4)), + ((5, 5), (3, 4)), + ((6, 6), (3, 4)), + ((7, 7), (3, 4)), + ((8, 8), (3, 4)), + ((9, 9), (3, 4)), + ((10, 10), (3, 4)), + ((11, 11), (3, 4)), + ((12, 12), (3, 4)), + ((13, 13), (3, 4)), + ((14, 14), (3, 4)), + ((15, 15), (3, 4)) +]) +def test_time_blocking(shape, blockshape): + prev = configuration['skew_factor'] if 'skew_factor' in configuration else 0 + configuration['skew_factor'] = 2 + wo_blocking, _ = _new_operator3(shape, time_order=2, dle='noop') + w_blocking, _ = _new_operator3(shape, time_order=2, dse='skewing', + dle=('blocking,openmp', + {'blockshape': blockshape, + 'blockinner': True})) + assert np.equal(wo_blocking.data, w_blocking.data).all() + configuration['skew_factor'] = prev + + @skipif_yask @pytest.mark.parametrize('exprs,expected', [ # trivial 1D diff --git a/tests/test_dse.py b/tests/test_dse.py index 6e172bc842..303c21043a 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -5,7 +5,7 @@ import pytest from conftest import x, y, z, time, skipif_yask # noqa -from devito import Eq # noqa +from devito import configuration from devito.ir import Stencil, clusterize, TemporariesGraph from devito.dse import rewrite, common_subexprs_elimination, collect from devito.symbolics import (xreplace_constrained, iq_timeinvariant, iq_timevarying, @@ -62,7 +62,7 @@ def test_acoustic_rewrite_basic(): # TTI -def tti_operator(dse=False, space_order=4): +def tti_operator(dse=False, space_order=4, dle='advanced'): nrec = 101 t0 = 0.0 tn = 250. @@ -91,7 +91,8 @@ def tti_operator(dse=False, space_order=4): rec.coordinates.data[:, 1:] = src.coordinates.data[0, 1:] return AnisotropicWaveSolver(model, source=src, receiver=rec, - time_order=2, space_order=space_order, dse=dse) + time_order=2, space_order=space_order, dse=dse, + dle=dle) @pytest.fixture(scope="session") @@ -134,6 +135,32 @@ def test_tti_rewrite_basic(tti_nodse): assert np.allclose(tti_nodse[1].data, rec.data, atol=10e-3) +# Time-tiling vs dse=None,space-tiling +@skipif_yask +def test_dle_tiling(tti_nodse): + prev = configuration['skew_factor'] if 'skew_factor' in configuration else 0 + configuration['skew_factor'] = 2 + operator = tti_operator(dse='skewing', dle='advanced') + rec, u, v, _ = operator.forward() + + assert np.allclose(tti_nodse[0].data, v.data, atol=10e-3) + assert np.allclose(tti_nodse[1].data, rec.data, atol=10e-3) + configuration['skew_factor'] = prev + + +# Skewed only vs. dse=None,dle=advanced (i.e. space-tiling) +@skipif_yask +def test_tti_rewrite_skewing(tti_nodse): + prev = configuration['skew_factor'] if 'skew_factor' in configuration else 0 + configuration['skew_factor'] = 2 + operator = tti_operator(dse='skewing', dle=None) + rec, u, v, _ = operator.forward() + + assert np.allclose(tti_nodse[0].data, v.data, atol=10e-1) + assert np.allclose(tti_nodse[1].data, rec.data, atol=10e-1) + configuration['skew_factor'] = prev + + @skipif_yask def test_tti_rewrite_advanced(tti_nodse): operator = tti_operator(dse='advanced')