Changes from all commits
25 commits
c1e7b0b  dse: introduce SkewingRewriter (nw0, Jan 12, 2018)
3818e03  dse: skew: offsets loop bounds correspondingly (nw0, Jan 12, 2018)
12579d6  operator: skewing: bugfix on loop bounds (nw0, Jan 19, 2018)
5c082a6  test_dse: skewing: derive test (tti) for skewing (nw0, Jan 19, 2018)
885f746  dle: _loop_blocking: change remainder loops to min (nw0, Jan 23, 2018)
68520fa  dle: _loop_blocking: use iteration limit rather than symbolic end (nw0, Jan 23, 2018)
277ce9c  Iteration: add skew param (nw0, Feb 12, 2018)
26b156e  dse: skew: skewed_loops: break into factor, dim (nw0, Feb 12, 2018)
ef9b723  dle: blocking: fix bounds on intra loop (nw0, Feb 12, 2018)
4d6047b  dle: blocking: widen bounds on outer blocks (nw0, Feb 12, 2018)
76f5037  dle: IterationFold: add skew parameter (nw0, Feb 12, 2018)
0dfe57b  test_dse: skewed tiling test case (nw0, Feb 12, 2018)
54704bc  dse: skew: skew before factorising (nw0, Feb 15, 2018)
3c9b608  test_dle: time-tiling example (nw0, Feb 15, 2018)
ccea7a1  dle: blocking: hoist upper bound from loop to support openmp (nw0, Feb 16, 2018)
5f9b6eb  dse: skewing: read param skew_factor (nw0, Feb 16, 2018)
93b7955  dle: blocking: detect skewing to enable time-tiling (nw0, Feb 16, 2018)
38abdf9  dle: blocking: move to Visitor (nw0, Feb 19, 2018)
246b8e1  dse: skewing: fix time dimension detection bug (nw0, Mar 5, 2018)
50a40bb  dle: blocking: visitor: hoist lower bound to avoid fmax problems (nw0, Apr 9, 2018)
f568386  operator: always pop autotune arg (nw0, May 30, 2018)
6d4e644  autotuner: tweak for time-tiling (nw0, May 30, 2018)
2683006  test_dse: force skewing factor for skewing test (nw0, May 30, 2018)
8524985  visitor: BlockIterations: remember old loops (nw0, Jun 4, 2018)
63abe73  test_dle: add test_time_blocking_edge_cases (nw0, Jun 4, 2018)
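The thread running through these commits is time-tiling enabled by loop skewing: the DSE skews spatial indices by a multiple of the time index, and the DLE then blocks the skewed loops, time dimension included. The sketch below is a plain-Python toy model of that transformation on a 1D three-point stencil; it is not code from this branch, and the buffer layout, tile sizes, and skew factor f = 1 are assumptions made purely for illustration.

```python
import numpy as np

def reference(u, timesteps):
    # Untiled baseline: a double-buffered 1D averaging stencil.
    nx = u.shape[1]
    for t in range(timesteps):
        for x in range(1, nx - 1):
            u[(t + 1) % 2, x] = 0.5 * (u[t % 2, x - 1] + u[t % 2, x + 1])
    return u

def skewed_tiled(u, timesteps, tt=4, xb=8, f=1):
    # Skewed coordinate xs = x + f*t; after the substitution every
    # dependence points "forward", so time and space can be tiled together.
    nx = u.shape[1]
    for t0 in range(0, timesteps, tt):                     # time tiles
        for xs0 in range(1, nx - 1 + f * timesteps, xb):   # skewed space tiles
            for t in range(t0, min(t0 + tt, timesteps)):
                lo = max(xs0, 1 + f * t)                   # skewed loop bounds
                hi = min(xs0 + xb, nx - 1 + f * t)
                for xs in range(lo, hi):
                    x = xs - f * t                         # undo the skew in the body
                    u[(t + 1) % 2, x] = 0.5 * (u[t % 2, x - 1] + u[t % 2, x + 1])
    return u

# Both orderings visit the same points and respect the stencil dependences.
rng = np.random.default_rng(0)
a = rng.random((2, 64))
b = a.copy()
assert np.allclose(reference(a, 12), skewed_tiled(b, 12))
```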
65 changes: 54 additions & 11 deletions devito/core/autotuning.py
@@ -53,17 +53,40 @@ def autotune(operator, arguments, tunable):

# Attempted block sizes ...
mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
time_dim = None
for i, d in mapper.items():
if d.original_dim.is_Time:
time_dim = i

# ... Defaults (basic mode)
blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']]
blocksizes = [OrderedDict([(i, v) for i in mapper if not mapper[i].original_dim.is_Time]) for v in options['at_blocksize']] # cubes
# ... Always try the entire iteration space (degenerate block)
datashape = [at_arguments[mapper[i].original_dim.symbolic_end.name] -
at_arguments[mapper[i].original_dim.symbolic_start.name] for i in mapper]
blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j))
for i, j in zip(mapper, datashape)]))
for i, j in zip(mapper, datashape)])) # degenerate block
# ... More attempts if auto-tuning in aggressive mode
if configuration.core['autotuning'] == 'aggressive':
last_dim = None
innermost = iterations[-1].dim
for k, v in mapper.items():
if v.original_dim == innermost:
last_dim = (k, blocksizes[-1][k])

blocksizes = more_heuristic_attempts(blocksizes)

if last_dim:
info_at("Extending the innermost dimension, %s <%s>" % (last_dim[0], last_dim[1]))
intermediate_blocks = [OrderedDict([(i, v) for i in mapper if not (mapper[i].original_dim.is_Time or mapper[i].original_dim == innermost)])
for v in options['at_blocksize']]
intermediate_blocks = more_heuristic_attempts(intermediate_blocks)
blocksizes += cross_time_tiles(intermediate_blocks, last_dim[0], [last_dim[1]])
# TODO: don't extend this: run generator for 2 dims, then extend that

if time_dim:
blocksizes = cross_time_tiles(blocksizes, time_dim, [1, 2, 4, 8, 16])


# How many temporaries are allocated on the stack?
# Will drop block sizes that might lead to a stack overflow
functions = FindSymbols('symbolics').visit(operator.body +
@@ -74,7 +97,14 @@ def autotune(operator, arguments, tunable):
# Note: there is only a single loop over 'blocksize' because only
# square blocks are tested
timings = OrderedDict()
fastest, timing = None, float("inf")
unique = []

for bs in blocksizes:
if bs in unique:
continue
unique.append(bs)

illegal = False
for k, v in at_arguments.items():
if k in bs:
@@ -115,12 +145,16 @@ def autotune(operator, arguments, tunable):
operator.cfunction(*list(at_arguments.values()))
elapsed = sum(operator.profiler.timings.values())
timings[tuple(bs.items())] = elapsed
if elapsed < timing:
fastest = tuple(bs.items())
timing = elapsed
info_at("Block shape <%s> took %f (s) in %d time steps" %
(','.join('%d' % i for i in bs.values()), elapsed, timesteps))

try:
best = dict(min(timings, key=timings.get))
info("Auto-tuned block shape: %s" % best)
# best = dict(min(timings, key=timings.get))
best = dict(fastest)
info("Auto-tuned block shape: %s; time: %f (s)" % (best, timing))
except ValueError:
info("Auto-tuning request, but couldn't find legal block sizes")
return arguments
@@ -140,6 +174,7 @@ def autotune(operator, arguments, tunable):
def more_heuristic_attempts(blocksizes):
# Ramp up to higher block sizes
handle = OrderedDict([(i, options['at_blocksize'][-1]) for i in blocksizes[0]])
# insert more cubes
for i in range(3):
new_bs = OrderedDict([(k, v*2) for k, v in handle.items()])
blocksizes.insert(blocksizes.index(handle) + 1, new_bs)
@@ -152,22 +187,30 @@ def more_heuristic_attempts(blocksizes):
handle.append(OrderedDict(list(bs.items())[:-1] + [list(i.items())[-1]]))
# Some more shuffling for all block sizes
for bs in list(blocksizes):
ncombs = len(bs)
ncombs = len(bs) # dimensions to tile over
for i in range(ncombs):
for j in combinations(bs, i+1):
item = [(k, bs[k]*2 if k in j else v) for k, v in bs.items()]
handle.append(OrderedDict(item))

unique = []
for i in blocksizes + handle:
if i not in unique:
unique.append(i)
return blocksizes + handle


def extend_dimension(blocksizes, dim, size):
return blocksizes + [OrderedDict([(dim, size) if dim == d else (d, s) for d, s in bs.items()]) for bs in blocksizes]


def cross_time_tiles(blocksizes, dim, tiles):
extended = []
for bs in blocksizes:
for tile in tiles:
extended.append(OrderedDict([(dim, tile)] + list(bs.items())))

return unique
return extended


options = {
'at_squeezer': 5,
'at_squeezer': 17,
'at_blocksize': sorted({8, 16, 24, 32, 40, 64, 128}),
'at_stack_limit': resource.getrlimit(resource.RLIMIT_STACK)[0] / 4
}
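The net effect of the cross_time_tiles helper added in this file is to take every candidate spatial block shape and pair it with each candidate time-tile size. A minimal standalone rendition, with made-up dimension names purely for illustration:

```python
from collections import OrderedDict

def cross_time_tiles(blocksizes, dim, tiles):
    # Prepend each candidate time-tile size to every existing block shape,
    # multiplying the number of candidates by len(tiles).
    extended = []
    for bs in blocksizes:
        for tile in tiles:
            extended.append(OrderedDict([(dim, tile)] + list(bs.items())))
    return extended

space_shapes = [OrderedDict([('x_block', 8), ('y_block', 8)]),
                OrderedDict([('x_block', 16), ('y_block', 16)])]
candidates = cross_time_tiles(space_shapes, 'time_block', [1, 2, 4])
assert len(candidates) == 6   # every space shape crossed with every time tile
```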
84 changes: 19 additions & 65 deletions devito/dle/backends/advanced.py
@@ -3,25 +3,24 @@
from __future__ import absolute_import

from collections import OrderedDict
from itertools import combinations

import cgen
import numpy as np
import psutil

from devito.cgen_utils import ccode
from devito.dimension import Dimension
from devito.dle import fold_blockable_tree, unfold_blocked_tree
from devito.dle.backends import (BasicRewriter, BlockingArg, dle_pass, omplang,
simdinfo, get_simd_flag, get_simd_items)
from devito.dse import promote_scalar_expressions
from devito.exceptions import DLEException
from devito.ir.iet import (Block, Expression, Iteration, List,
PARALLEL, ELEMENTAL, REMAINDER, tagger,
from devito.ir.iet import (Block, Expression, Iteration, List, ELEMENTAL,
FindNodes, FindSymbols, IsPerfectIteration,
SubstituteExpression, Transformer, compose_nodes,
retrieve_iteration_tree, filter_iterations, copy_arrays)
retrieve_iteration_tree, filter_iterations,
copy_arrays, BlockIterations)
from devito.logger import dle_warning
from devito.parameters import configuration
from devito.tools import as_tuple, grouper, roundm
from devito.types import Array

@@ -35,7 +34,7 @@ def _pipeline(self, state):
self._simdize(state)
if self.params['openmp'] is True:
self._ompize(state)
self._create_elemental_functions(state)
# self._create_elemental_functions(state)
self._minimize_remainders(state)

@dle_pass
@@ -129,7 +128,12 @@ def _loop_blocking(self, nodes, state):
blocked = OrderedDict()
for tree in retrieve_iteration_tree(fold):
# Is the Iteration tree blockable ?
iterations = [i for i in tree if i.is_Parallel]
# FIXME: change mark_parallel to ensure skewed loops are is_Parallel
if configuration['skew_factor']:
iterations = tree
else:
iterations = [i for i in tree if i.is_Parallel]

if exclude_innermost:
iterations = [i for i in iterations if not i.is_Vectorizable]
if len(iterations) <= 1:
@@ -145,66 +149,16 @@
# sequential loop (e.g., a timestepping loop)
continue

# Decorate intra-block iterations with an IterationProperty
TAG = tagger(len(mapper))

# Build all necessary Iteration objects, individually. These will
# subsequently be composed to implement loop blocking.
inter_blocks = []
intra_blocks = []
remainders = []
for i in iterations:
name = "%s%d_block" % (i.dim.name, len(mapper))

# Build Iteration over blocks
dim = blocked.setdefault(i, Dimension(name))
block_size = dim.symbolic_size
iter_size = i.dim.symbolic_extent
start = i.limits[0] - i.offsets[0]
finish = i.dim.symbolic_end - i.offsets[1]
innersize = iter_size - (-i.offsets[0] + i.offsets[1])
finish = finish - (innersize % block_size)
inter_block = Iteration([], dim, [start, finish, block_size],
properties=PARALLEL)
inter_blocks.append(inter_block)

# Build Iteration within a block
start = inter_block.dim
finish = start + block_size
intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None,
properties=i.properties + (TAG, ELEMENTAL))
intra_blocks.append(intra_block)

# Build unitary-increment Iteration over the 'leftover' region.
# This will be used for remainder loops, executed when any
# dimension size is not a multiple of the block size.
start = inter_block.limits[1]
finish = i.dim.symbolic_end - i.offsets[1]
remainder = i._rebuild([], limits=[start, finish, 1], offsets=None)
remainders.append(remainder)

# Build blocked Iteration nest
blocked_tree = compose_nodes(inter_blocks + intra_blocks +
[iterations[-1].nodes])

# Build remainder Iterations
remainder_trees = []
for n in range(len(iterations)):
for c in combinations([i.dim for i in iterations], n + 1):
# First all inter-block Iterations
nodes = [b._rebuild(properties=b.properties + (REMAINDER,))
for b, r in zip(inter_blocks, remainders)
if r.dim not in c]
# Then intra-block or remainder, for each dim (in order)
properties = (REMAINDER, TAG, ELEMENTAL)
for b, r in zip(intra_blocks, remainders):
handle = r if b.dim in c else b
nodes.append(handle._rebuild(properties=properties))
nodes.extend([iterations[-1].nodes])
remainder_trees.append(compose_nodes(nodes))
condition = lambda i: (i in iterations)
tag = len(mapper)
blocker = BlockIterations(tag, blocked, condition=condition)
intra_blocks = blocker.visit(root)
inter_blocks = blocker.inter_blocks
blocked = blocker.blocked
blocked_tree = compose_nodes(inter_blocks + [intra_blocks])

# Will replace with blocked loop tree
mapper[root] = List(body=[blocked_tree] + remainder_trees)
mapper[root] = List(body=[blocked_tree])

rebuilt = Transformer(mapper).visit(fold)
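For readers unfamiliar with the transformation, the blocked tree that _loop_blocking now asks BlockIterations to build corresponds, per the earlier commits, to the min-clamped form of loop blocking, in which leftover iterations are absorbed by the inner bound instead of being peeled into remainder loops. A rough Python model of that structure (illustrative only, not the generated code):

```python
def blocked_nest(nx, ny, bx, by, kernel):
    # Outer "inter-block" loops step over block origins; inner "intra-block"
    # loops clamp their upper bound with min, so partial blocks at the edges
    # need no dedicated remainder loops.
    for x0 in range(0, nx, bx):
        for y0 in range(0, ny, by):
            for x in range(x0, min(x0 + bx, nx)):
                for y in range(y0, min(y0 + by, ny)):
                    kernel(x, y)
```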

5 changes: 3 additions & 2 deletions devito/dle/blocking_utils.py
@@ -202,9 +202,10 @@ class IterationFold(Iteration):
is_IterationFold = True

def __init__(self, nodes, dimension, limits, index=None, offsets=None,
properties=None, pragmas=None, uindices=None, folds=None):
properties=None, pragmas=None, uindices=None, folds=None,
skew=None):
super(IterationFold, self).__init__(nodes, dimension, limits, index, offsets,
properties, uindices, pragmas)
properties, uindices, pragmas, skew=skew)
self.folds = folds

def __repr__(self):
3 changes: 2 additions & 1 deletion devito/dse/backends/__init__.py
@@ -1,6 +1,7 @@
from devito.dse.backends.common import * # noqa
from devito.dse.backends.basic import BasicRewriter # noqa
from devito.dse.backends.advanced import AdvancedRewriter # noqa
from devito.dse.backends.advanced import (AdvancedRewriter, # noqa
SkewingRewriter)
from devito.dse.backends.speculative import (SpeculativeRewriter, # noqa
AggressiveRewriter,
CustomRewriter)
38 changes: 37 additions & 1 deletion devito/dse/backends/advanced.py
@@ -2,10 +2,12 @@

from collections import OrderedDict

from devito.dimension import TimeDimension
from devito.ir import clusterize
from devito.dse.aliases import collect
from devito.dse.backends import BasicRewriter, dse_pass
from devito.symbolics import Eq, estimate_cost, xreplace_constrained, iq_timeinvariant
from devito.parameters import configuration
from devito.symbolics import Eq, estimate_cost, xreplace_constrained, iq_timeinvariant, xreplace_indices
from devito.dse.manipulation import (common_subexprs_elimination, collect_nested,
compact_temporaries)
from devito.types import Indexed, Scalar, Array
@@ -162,3 +164,37 @@ def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
processed = [e.xreplace(rules) for e in processed]

return alias_clusters + [cluster.rebuild(processed)]


class SkewingRewriter(AdvancedRewriter):

def _pipeline(self, state):
self._loop_skew(state)
self._extract_time_invariants(state)
self._eliminate_inter_stencil_redundancies(state)
self._eliminate_intra_stencil_redundancies(state)
self._factorize(state)

@dse_pass
def _loop_skew(self, cluster, template, **kwargs):
skew_factor = -configuration['skew_factor']
t, mapper = None, {}
skews = {}

# FIXME: this is probably the wrong way to find the time dimension
for dim in cluster.stencil.dimensions:
if t is not None:
mapper[dim] = dim + skew_factor * t
skews[dim] = (skew_factor, t)
elif dim.is_Time:
if isinstance(dim, TimeDimension):
t = dim
elif isinstance(dim.parent, TimeDimension):
t = dim.parent

if t is None:
return cluster

cluster.skewed_loops = skews
processed = xreplace_indices(cluster.exprs, mapper)
return cluster.rebuild(processed)
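Stripped of the Devito machinery, the substitution that _loop_skew builds maps every non-time index x to x - skew_factor*t (the factor is negated before the mapper is assembled, hence dim + skew_factor*t above); the companion commits then offset the loop bounds so the iteration space stays the same. A small SymPy sketch of the index rewrite, with illustrative symbol names:

```python
from sympy import Function, symbols

t, x, y = symbols('t x y')
u = Function('u')

# A toy update expression standing in for a cluster's stencil expressions.
expr = u(t + 1, x, y) - 0.5 * (u(t, x - 1, y) + u(t, x + 1, y))

factor = 2  # plays the role of configuration['skew_factor']
mapper = {x: x - factor * t, y: y - factor * t}
skewed = expr.subs(mapper, simultaneous=True)
# -> u(t + 1, x - 2*t, y - 2*t)
#    - 0.5*(u(t, x - 2*t - 1, y - 2*t) + u(t, x - 2*t + 1, y - 2*t))
```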
7 changes: 6 additions & 1 deletion devito/dse/transformer.py
@@ -2,7 +2,7 @@

from devito.ir.clusters import ClusterGroup, groupby
from devito.dse.backends import (BasicRewriter, AdvancedRewriter, SpeculativeRewriter,
AggressiveRewriter, CustomRewriter)
AggressiveRewriter, CustomRewriter, SkewingRewriter)
from devito.exceptions import DSEException
from devito.logger import dse_warning
from devito.parameters import configuration
@@ -13,12 +13,17 @@
modes = {
'basic': BasicRewriter,
'advanced': AdvancedRewriter,
'skewing': SkewingRewriter,
'speculative': SpeculativeRewriter,
'aggressive': AggressiveRewriter
}
"""The DSE transformation modes."""

# FIXME: unsure what this should be
MAX_SKEW_FACTOR = 8

configuration.add('dse', 'advanced', list(modes))
configuration.add('skew_factor', 0, range(MAX_SKEW_FACTOR))
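Assuming these two configuration entries are the intended user-facing switches, selecting the new pipeline from user code would presumably look like the following sketch (not taken from the branch's documentation; the Operator lines are commented out and only indicative):

```python
from devito import configuration

configuration['dse'] = 'skewing'     # use the SkewingRewriter pipeline
configuration['skew_factor'] = 2     # must lie in range(MAX_SKEW_FACTOR)

# op = Operator(eqns, dle='advanced')  # blocking then detects the skewing
# op.apply(time=timesteps, autotune=True)
```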


def rewrite(clusters, mode='advanced'):
2 changes: 1 addition & 1 deletion devito/ir/clusters/algorithms.py
@@ -259,7 +259,7 @@ def clusterize(exprs, stencils):
clusters = ClusterGroup()
for target, pc in mapper.items():
exprs = [i for i in pc.exprs if i.lhs.is_Symbol or i.lhs == target]
clusters.append(PartialCluster(exprs, pc.stencil))
clusters.append(PartialCluster(exprs, pc.stencil, pc.skewed_loops))

# Attempt grouping as many PartialClusters as possible together
return groupby(clusters)