diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f7bc2814..6e354846 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,11 +4,19 @@ stages:
 make:generic:
   stage: build
   script:
+    - git clone https://xgitlab.cels.anl.gov/argo/excit.git
+    - cd excit
     - ./autogen.sh
     - mkdir build
     - ./configure --prefix=`pwd`/build
     - make
     - make install
+    - cd ..
+    - ./autogen.sh
+    - mkdir build
+    - PKG_CONFIG_PATH=excit/build/lib/pkgconfig ./configure --prefix=`pwd`/build
+    - make
+    - make install
     - make check
   artifacts:
     when: on_failure
@@ -22,9 +30,17 @@ make:knl:
   stage: build
   script:
     - source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
+    - git clone https://xgitlab.cels.anl.gov/argo/excit.git
+    - cd excit
+    - ./autogen.sh
+    - mkdir build
+    - ./configure --prefix=`pwd`/build
+    - make
+    - make install
+    - cd ..
     - ./autogen.sh
     - mkdir build
-    - CC=icc CFLAGS="-mkl -xhost" ./configure --prefix=`pwd`/build --enable-benchmarks
+    - CC=icc CFLAGS="-mkl -xhost" PKG_CONFIG_PATH=excit/build/lib/pkgconfig ./configure --prefix=`pwd`/build --enable-benchmarks
     - make -j64
     - make install
     - make check
diff --git a/configure.ac b/configure.ac
index bf7aeec9..06260db0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -52,6 +52,9 @@ AM_CONDITIONAL([ADD_BENCHMARKS],[test "x$benchmarks" = xtrue])
 AC_CHECK_HEADERS(numa.h)
 AC_CHECK_LIB(numa, move_pages)
 
+# excit iterators
+PKG_CHECK_MODULES([EXCIT],[libexcit])
+
 # internal jemalloc
 ac_configure_args="$ac_configure_args \
 			'--with-jemalloc-prefix=jemk_aml_' \
diff --git a/src/Makefile.am b/src/Makefile.am
index f8fca5cf..b945d5c0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,4 @@
-AM_CPPFLAGS = -I$(top_srcdir)/jemalloc/include
+AM_CPPFLAGS = -I$(top_srcdir)/jemalloc/include @EXCIT_CFLAGS@
 lib_LTLIBRARIES = libaml.la
 
 ARENA_JEMALLOC_CSOURCES = arena_jemalloc.c
@@ -10,21 +10,33 @@ AREA_LINUX_CSOURCES = area_linux.c \
 
 AREA_POSIX_CSOURCES = area_posix.c
 
+LAYOUT_CSOURCES = layout.c \
+		  layout_dense.c \
+		  layout_pad.c \
+		  layout_reshape.c
+
 TILING_CSOURCES = tiling.c \
 		  tiling_1d.c \
 		  tiling_2d.c
 
+TILING_ND_CSOURCES = tiling_nd.c \
+		     tiling_nd_resize.c \
+		     tiling_nd_pad.c \
+		     tiling_nd_collapse.c
+
 BINDING_CSOURCES = binding.c \
 		   binding_single.c \
 		   binding_interleave.c
 
 DMA_CSOURCES = dma.c \
 	       dma_linux_par.c \
-	       dma_linux_seq.c
+	       dma_linux_seq.c \
+	       dma_layout.c
 
 SCRATCH_CSOURCES = scratch.c \
 		   scratch_seq.c \
-		   scratch_par.c
+		   scratch_par.c \
+		   scratch_double.c
 
 UTILS_CSOURCES = vector.c
 
@@ -34,12 +46,24 @@ LIBCSOURCES = aml.c area.c arena.c \
 	      $(AREA_LINUX_CSOURCES) \
 	      $(AREA_POSIX_CSOURCES) \
 	      $(TILING_CSOURCES) \
+	      $(TILING_ND_CSOURCES) \
 	      $(BINDING_CSOURCES) \
 	      $(DMA_CSOURCES) \
-	      $(SCRATCH_CSOURCES)
+	      $(SCRATCH_CSOURCES) \
+	      $(LAYOUT_CSOURCES) \
+	      copy.c
 
-LIBHSOURCES = aml.h
+LIBHSOURCES = aml.h \
+	      aml-layout.h \
+	      aml-layout-dense.h \
+	      aml-layout-pad.h \
+	      aml-layout-reshape.h \
+	      aml-tiling.h \
+	      aml-tiling-resize.h \
+	      aml-tiling-pad.h \
+	      aml-tiling-collapse.h \
+	      aml-copy.h
 
 libaml_la_SOURCES = $(LIBCSOURCES) $(LIBHSOURCES)
-libaml_la_LIBADD = -L$(top_srcdir)/jemalloc/lib/ -ljemalloc-aml
+libaml_la_LIBADD = -L$(top_srcdir)/jemalloc/lib/ -ljemalloc-aml @EXCIT_LIBS@
 include_HEADERS = $(LIBHSOURCES)
diff --git a/src/aml-copy.h b/src/aml-copy.h
new file mode 100644
index 00000000..22dd8933
--- /dev/null
+++ b/src/aml-copy.h
@@ -0,0 +1,192 @@
+#ifndef AML_COPY_H
+#define AML_COPY_H 1
+
+ /*******************************************************************************
+ * Hypervolume copy and transpose functions.
+ ******************************************************************************/
+
+/*
+ * Copies a (sub-)hypervolume to another (sub-)hypervolume.
+ * "d": number of dimensions.
+ * "dst": pointer to the destination hypervolume.
+ * "dst_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the destination hypervolume.
+ * "src": pointer to the source hypervolume.
+ * "src_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the source hypervolume.
+ * "elem_number": pointer to d values representing the number of elements
+ *                in each dimension of the (sub-)hypervolume to copy.
+ * "elem_size": size of memory elements.
+ * Returns 0 if successful; an error code otherwise.
+ */
+int aml_copy_nd(size_t d, void *dst, const size_t *dst_pitch,
+		const void *src, const size_t *src_pitch,
+		const size_t *elem_number, const size_t elem_size);
+/*
+ * Copies a (sub-)hypervolume to another (sub-)hypervolume while transposing.
+ * Reverse of aml_copy_rtnd.
+ * Example a[3][4][5] -> b[5][3][4] (C notation).
+ * "d": number of dimensions.
+ * "dst": pointer to the destination hypervolume.
+ * "dst_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the destination hypervolume.
+ * "src": pointer to the source hypervolume.
+ * "src_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the source hypervolume.
+ * "elem_number": pointer to d values representing the number of elements
+ *                in each dimension of the (sub-)hypervolume to copy.
+ * "elem_size": size of memory elements in the src hypervolume order.
+ * Returns 0 if successful; an error code otherwise.
+ */
+int aml_copy_tnd(size_t d, void *dst, const size_t *dst_pitch,
+		 const void *src, const size_t *src_pitch,
+		 const size_t *elem_number, const size_t elem_size);
+/*
+ * Copies a (sub-)hypervolume to another (sub-)hypervolume while transposing.
+ * Reverse of aml_copy_tnd.
+ * Example a[3][4][5] -> b[4][5][3] (C notation).
+ * "d": number of dimensions.
+ * "dst": pointer to the destination hypervolume.
+ * "dst_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the destination hypervolume.
+ * "src": pointer to the source hypervolume.
+ * "src_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the source hypervolume.
+ * "elem_number": pointer to d values representing the number of elements
+ *                in each dimension of the (sub-)hypervolume to copy.
+ * "elem_size": size of memory elements in the src hypervolume order.
+ * Returns 0 if successful; an error code otherwise.
+ */
+int aml_copy_rtnd(size_t d, void *dst, const size_t *dst_pitch,
+		  const void *src, const size_t *src_pitch,
+		  const size_t *elem_number, const size_t elem_size);
+
+/*
+ * Copies a (sub-)hypervolume to another (sub-)hypervolume while shuffling
+ * dimensions. Example a[4][2][3][5] -> b[5][4][3][2] (C notation).
+ * "d": number of dimensions.
+ * "target_dims": array of d dimension index representing the mapping
+ *                between the source dimensions and the target dimensions.
+ *                Example [3, 1, 0, 2]
+ * "dst": pointer to the destination hypervolume.
+ * "dst_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the destination hypervolume.
+ * "src": pointer to the source hypervolume.
+ * "src_pitch": pointer to d-1 pitch values representing the pitch
+ *              in each dimension of the source hypervolume.
+ * "elem_number": pointer to d values representing the number of elements
+ *                in each dimension of the (sub-)hypervolume to copy.
+ * "elem_size": size of memory elements in the src hypervolume order.
+ * Returns 0 if successful; an error code otherwise.
+ */
+int aml_copy_shnd(size_t d, const size_t *target_dims, void *dst,
+		  const size_t *dst_pitch, const void *src,
+		  const size_t *src_pitch, const size_t *elem_number,
+		  const size_t elem_size);
+/*
+ * Strided version of aml_copy_nd.
+ */
+int aml_copy_ndstr(size_t d, void *dst, const size_t *dst_pitch,
+		   const size_t *dst_stride, const void *src,
+		   const size_t *src_pitch, const size_t *src_stride,
+		   const size_t *elem_number, const size_t elem_size);
+/*
+ * Strided version of aml_copy_tnd.
+ */
+int aml_copy_tndstr(size_t d, void *dst, const size_t *dst_pitch,
+		    const size_t *dst_stride, const void *src,
+		    const size_t *src_pitch, const size_t *src_stride,
+		    const size_t *elem_number, const size_t elem_size);
+/*
+ * Strided version of aml_copy_rtnd.
+ */
+int aml_copy_rtndstr(size_t d, void *dst, const size_t *dst_pitch,
+		     const size_t *dst_stride, const void *src,
+		     const size_t *src_pitch, const size_t *src_stride,
+		     const size_t *elem_number, const size_t elem_size);
+/*
+ * Strided version of aml_copy_shnd.
+ */
+int aml_copy_shndstr(size_t d, const size_t *target_dims, void *dst,
+		     const size_t *dst_pitch, const size_t *dst_stride,
+		     const void *src, const size_t *src_pitch,
+		     const size_t *src_stride, const size_t *elem_number,
+		     const size_t elem_size);
+/*
+ * Version of aml_copy_nd using cumulative pitch.
+ */
+int aml_copy_nd_c(size_t d, void *dst, const size_t *cumul_dst_pitch,
+		  const void *src, const size_t *cumul_src_pitch,
+		  const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_ndstr using cumulative pitch.
+ */
+int aml_copy_ndstr_c(size_t d, void *dst, const size_t *dst_pitch,
+		     const size_t *cumul_dst_stride, const void *src,
+		     const size_t *src_pitch, const size_t *cumul_src_stride,
+		     const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_nd using cumulative pitch.
+ */
+int aml_copy_tnd_c(size_t d, void *dst, const size_t *cumul_dst_pitch,
+		   const void *src, const size_t *cumul_src_pitch,
+		   const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_nd using cumulative pitch.
+ */
+int aml_copy_rtnd_c(size_t d, void *dst, const size_t *cumul_dst_pitch,
+		    const void *src, const size_t *cumul_src_pitch,
+		    const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_shnd using cumulative pitch.
+ */
+int aml_copy_shnd_c(size_t d, const size_t *target_dims, void *dst,
+		    const size_t *cumul_dst_pitch, const void *src,
+		    const size_t *cumul_src_pitch, const size_t *elem_number,
+		    const size_t elem_size);
+/*
+ * Version of aml_copy_tndstr using cumulative pitch.
+ */
+int aml_copy_tndstr_c(size_t d, void *dst, const size_t *cumul_dst_pitch,
+		      const size_t *dst_stride, const void *src,
+		      const size_t *cumul_src_pitch, const size_t *src_stride,
+		      const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_rtndstr using cumulative pitch.
+ */
+int aml_copy_rtndstr_c(size_t d, void *dst, const size_t *cumul_dst_pitch,
+		       const size_t *dst_stride, const void *src,
+		       const size_t *cumul_src_pitch, const size_t *src_stride,
+		       const size_t *elem_number, const size_t elem_size);
+/*
+ * Version of aml_copy_shndstr using cumulative pitch.
+ */
+int aml_copy_shndstr_c(size_t d, const size_t *target_dims, void *dst,
+		       const size_t *cumul_dst_pitch, const size_t *dst_stride,
+		       const void *src, const size_t *cumul_src_pitch,
+		       const size_t *src_stride, const size_t *elem_number,
+		       const size_t elem_size);
+
+ /*******************************************************************************
+ * Generic building block API: Native version
+ * Native means using AML-internal layouts.
+ ******************************************************************************/
+
+int aml_copy_layout_native(struct aml_layout *dst,
+			   const struct aml_layout *src);
+int aml_copy_layout_transform_native(struct aml_layout *dst,
+				     const struct aml_layout *src,
+				     const size_t *target_dims);
+int aml_copy_layout_generic(struct aml_layout *dst,
+			    const struct aml_layout *src);
+int aml_copy_layout_transform_generic(struct aml_layout *dst,
+				      const struct aml_layout *src,
+				      const size_t *target_dims);
+int aml_copy_layout_transpose_native(struct aml_layout *dst, const struct aml_layout *src);
+int aml_copy_layout_reverse_transpose_native(struct aml_layout *dst,
+					     const struct aml_layout *src);
+int aml_copy_layout_transpose_generic(struct aml_layout *dst, const struct aml_layout *src);
+int aml_copy_layout_reverse_transpose_generic(struct aml_layout *dst,
+					      const struct aml_layout *src);
+
+#endif
diff --git a/src/aml-dma-layout.h b/src/aml-dma-layout.h
new file mode 100644
index 00000000..4e3e68a3
--- /dev/null
+++ b/src/aml-dma-layout.h
@@ -0,0 +1,41 @@
+#ifndef AML_DMA_LAYOUT_H
+#define AML_DMA_LAYOUT_H 1
+
+/*******************************************************************************
+ * Layout aware DMA
+ * DMA using layouts as source and destination.
+ ******************************************************************************/
+
+extern struct aml_dma_ops aml_dma_ops_layout;
+
+struct aml_dma_request_layout {
+	int type;
+	struct aml_layout *dest;
+	struct aml_layout *src;
+};
+
+typedef int (*aml_dma_operator)(struct aml_layout *, struct aml_layout *, void*);
+struct aml_dma_layout {
+	struct aml_vector requests;
+	pthread_mutex_t lock;
+	aml_dma_operator do_work;
+	void *work_arg;
+};
+
+#define AML_DMA_LAYOUT_DECL(name) \
+	struct aml_dma_layout __ ##name## _inner_data; \
+	struct aml_dma name = { \
+		&aml_dma_ops_layout, \
+		(struct aml_dma_data *)&__ ## name ## _inner_data, \
+	};
+
+#define AML_DMA_LAYOUT_ALLOCSIZE \
+	(sizeof(struct aml_dma_layout) + \
+	 sizeof(struct aml_dma))
+
+int aml_dma_layout_create(struct aml_dma **dma, ...);
+int aml_dma_layout_init(struct aml_dma *dma, ...);
+int aml_dma_layout_vinit(struct aml_dma *dma, va_list args);
+int aml_dma_layout_destroy(struct aml_dma *dma);
+
+#endif
diff --git a/src/aml-layout-dense.h b/src/aml-layout-dense.h
new file mode 100644
index 00000000..d7782cdd
--- /dev/null
+++ b/src/aml-layout-dense.h
@@ -0,0 +1,75 @@
+#ifndef AML_LAYOUT_DENSE_H
+#define AML_LAYOUT_DENSE_H 1
+
+#include <stdarg.h>
+
+/*******************************************************************************
+ * Native Layout Operators.
+ ******************************************************************************/
+
+/* Layout: describes how a  multi-dimensional dense data structure is collapsed
+ * into a linear (and contiguous) virtual address range.
+ * "ptr": base pointer of the address range
+ * "ndims": number of dimensions
+ * "dims": dimensions, in element size, of the data structure, by order of
+ *         appearance in memory.
+ * "stride": offset between elements of the same dimension.
+ * "pitch": distances between two elements of the next dimension (or total
+            dimension of the layout in this dimension).
+ * "cpitch": cumulative distances between two elements in the same dimension
+ *           (pitch[0] is the element size in bytes).
+ */
+struct aml_layout_data_native {
+	void *ptr;
+	size_t ndims;
+	size_t *dims;
+	size_t *stride;
+	size_t *pitch;
+	size_t *cpitch;
+};
+
+#define AML_LAYOUT_NATIVE_ALLOCSIZE(ndims) (sizeof(struct aml_layout) +\
+					sizeof(struct aml_layout_data_native) +\
+					(ndims * 4 + 1) * sizeof(size_t))
+
+#define AML_LAYOUT_NATIVE_DECL(name, ndims) \
+	size_t __ ##name## _inner_data[ndims * 4 + 1]; \
+	struct aml_layout_data_native __ ##name## _inner_struct = { \
+		NULL, \
+		ndims, \
+		__ ##name## _inner_data, \
+		__ ##name## _inner_data + ndims, \
+		__ ##name## _inner_data + 2 * ndims, \
+		__ ##name## _inner_data + 3 * ndims, \
+	}; \
+	struct aml_layout name = { \
+		0, \
+		NULL, \
+		(struct aml_layout_data *)& __ ##name## _inner_struct, \
+	};
+
+int aml_layout_native_struct_init(struct aml_layout *l, size_t ndims,
+				  void *data);
+int aml_layout_native_ainit(struct aml_layout *l, uint64_t tags, void *ptr,
+			    const size_t element_size, size_t ndims,
+			    const size_t *dims, const size_t *stride,
+			    const size_t *pitch);
+int aml_layout_native_vinit(struct aml_layout *l, uint64_t tags, void *ptr,
+			    const size_t element_size, size_t ndims,
+			    va_list data);
+int aml_layout_native_init(struct aml_layout *l, uint64_t tags, void *ptr,
+			   const size_t element_size, size_t ndims, ...);
+int aml_layout_native_acreate(struct aml_layout **l, uint64_t tags, void *ptr,
+			      const size_t element_size, size_t ndims,
+			      const size_t *dims, const size_t *stride,
+			      const size_t *pitch);
+int aml_layout_native_vcreate(struct aml_layout **l, uint64_t tags, void *ptr,
+			      const size_t element_size, size_t ndims,
+			      va_list data);
+int aml_layout_native_create(struct aml_layout **l, uint64_t tags, void *ptr,
+			     const size_t element_size, size_t ndims, ...);
+
+extern struct aml_layout_ops aml_layout_column_ops;
+extern struct aml_layout_ops aml_layout_row_ops;
+
+#endif
diff --git a/src/aml-layout-pad.h b/src/aml-layout-pad.h
new file mode 100644
index 00000000..6705375f
--- /dev/null
+++ b/src/aml-layout-pad.h
@@ -0,0 +1,58 @@
+#ifndef AML_LAYOUT_PAD_H
+#define AML_LAYOUT_PAD_H 1
+
+#include <stdarg.h>
+
+struct aml_layout_data_pad {
+	struct aml_layout *target;
+	size_t ndims;
+	size_t element_size;
+	size_t *dims;
+	size_t *target_dims;
+	void *neutral;
+};
+
+
+#define AML_LAYOUT_PAD_ALLOCSIZE(ndims, neutral_size) ( \
+	sizeof(struct aml_layout) + \
+	sizeof(struct aml_layout_data_pad) + \
+	2 * ndims * sizeof(size_t) + \
+	neutral_size )
+
+#define AML_LAYOUT_PAD_DECL(name, ndims, neutral_size) \
+	uint8_t __ ##name## _inner_data[2 * ndims * sizeof(size_t) + \
+					neutral_size ]; \
+	struct aml_layout_data_pad __ ##name## _inner_struct = { \
+		NULL, \
+		ndims, \
+		neutral_size, \
+		(size_t *) __ ##name## _inner_data, \
+		(size_t *) (__ ##name## _inner_data + ndims * sizeof(size_t)), \
+		(void *) (__ ##name## _inner_data + 2 * ndims * sizeof(size_t)) \
+	}; \
+	struct aml_layout name = { \
+		0, \
+		NULL, \
+		(struct aml_layout_data *)& __ ##name## _inner_struct \
+	};
+
+int aml_layout_pad_struct_init(struct aml_layout *l, size_t ndims,
+			       size_t element_size, void *data);
+int aml_layout_pad_ainit(struct aml_layout *l, uint64_t tags,
+			 struct aml_layout *target, const size_t *dims,
+			 void *neutral);
+int aml_layout_pad_vinit(struct aml_layout *l, uint64_t tags,
+			 struct aml_layout *target, va_list data);
+int aml_layout_pad_init(struct aml_layout *l, uint64_t tags,
+			struct aml_layout *target, ...);
+int aml_layout_pad_acreate(struct aml_layout **l, uint64_t tags,
+			   struct aml_layout *target, const size_t *dims,
+			   void *neutral);
+int aml_layout_pad_vcreate(struct aml_layout **l, uint64_t tags,
+			   struct aml_layout *target, va_list data);
+int aml_layout_pad_create(struct aml_layout **l, uint64_t tags,
+			  struct aml_layout *target, ...);
+
+extern struct aml_layout_ops aml_layout_pad_column_ops;
+extern struct aml_layout_ops aml_layout_pad_row_ops;
+#endif
diff --git a/src/aml-layout-reshape.h b/src/aml-layout-reshape.h
new file mode 100644
index 00000000..c8207c0c
--- /dev/null
+++ b/src/aml-layout-reshape.h
@@ -0,0 +1,60 @@
+#ifndef AML_LAYOUT_RESHAPE_H
+#define AML_LAYOUT_RESHAPE_H
+
+#include <stdarg.h>
+
+struct aml_layout_data_reshape {
+	struct aml_layout *target;
+	size_t ndims;
+	size_t target_ndims;
+	size_t *dims;
+	size_t *coffsets;
+	size_t *target_dims;
+	size_t *target_coffsets;
+};
+
+#define AML_LAYOUT_RESHAPE_ALLOCSIZE(ndims, target_ndims) ( \
+	sizeof(struct aml_layout) + \
+        sizeof(struct aml_layout_data_reshape) + \
+	2 * ndims * sizeof(size_t) + \
+	target_ndims * sizeof(size_t) )
+
+#define AML_LAYOUT_RESHAPE_DECL(name, ndims, target_ndims) \
+	size_t __ ##name## _inner_data[ 2 * ndims + target_ndims]; \
+	struct aml_layout_data_reshape __ ##name## _inner_struct = { \
+		NULL, \
+		ndims, \
+		target_ndims, \
+		__ ##name## _inner_data, \
+		__ ##name## _inner_data + ndims \
+		__ ##name## _inner_data + 2 * ndims \
+	}; \
+	struct aml_layout name = { \
+		0, \
+		NULL, \
+		(struct aml_layout_data *)& __ ##name## _inner_struct \
+	};
+
+int aml_layout_reshape_struct_init(struct aml_layout *l, size_t ndims,
+				   void *data);
+int aml_layout_reshape_ainit(struct aml_layout *l, uint64_t tags,
+			     struct aml_layout *target, size_t ndims,
+			     const size_t *dims);
+int aml_layout_reshape_vinit(struct aml_layout *l, uint64_t tags,
+			     struct aml_layout *target, size_t ndims,
+			     va_list data);
+int aml_layout_reshape_init(struct aml_layout *l, uint64_t tags,
+			    struct aml_layout *target, size_t ndims, ...);
+int aml_layout_reshape_acreate(struct aml_layout **l, uint64_t tags,
+			       struct aml_layout *target, size_t ndims,
+			       const size_t *dims);
+int aml_layout_reshape_vcreate(struct aml_layout **l, uint64_t tags,
+			       struct aml_layout *target, size_t ndims,
+			       va_list data);
+int aml_layout_reshape_create(struct aml_layout **l, uint64_t tags,
+			      struct aml_layout *target, size_t ndims, ...);
+
+extern struct aml_layout_ops aml_layout_reshape_column_ops;
+extern struct aml_layout_ops aml_layout_reshape_row_ops;
+
+#endif
diff --git a/src/aml-layout.h b/src/aml-layout.h
new file mode 100644
index 00000000..44327472
--- /dev/null
+++ b/src/aml-layout.h
@@ -0,0 +1,86 @@
+#ifndef AML_LAYOUT_H
+#define AML_LAYOUT_H 1
+
+#include <stdarg.h>
+
+/*******************************************************************************
+ * Data Layout Management:
+ ******************************************************************************/
+
+struct aml_layout;
+struct aml_layout_data;
+
+/*******************************************************************************
+ * Generic layout, with support for sparsity and strides.
+ ******************************************************************************/
+
+/* Layout type tags. Defined as the bit offset to set to one. */
+#define AML_TYPE_LAYOUT_ORDER (1 << 0)
+#define AML_TYPE_LAYOUT_MAX (1 << 1)
+
+#define AML_TYPE_LAYOUT_ROW_ORDER 1
+#define AML_TYPE_LAYOUT_COLUMN_ORDER 0
+
+#define AML_TYPE_GET(tags, bit) (tags & bit)
+#define AML_TYPE_CLEAR(tags, bit) (tags &= ~bit)
+#define AML_TYPE_SET(tags, bit, value) do { \
+	AML_TYPE_CLEAR(tags, bit); \
+	if(value) tags |= bit;} while(0)
+
+
+struct aml_layout_ops {
+	void *(*deref)(const struct aml_layout_data *, va_list coords);
+	void *(*aderef)(const struct aml_layout_data *, const size_t *coords);
+	void *(*aderef_column)(const struct aml_layout_data *,
+			       const size_t *coords);
+	int (*order)(const struct aml_layout_data *);
+	int (*dims)(const struct aml_layout_data *, va_list dim_ptrs);
+	int (*adims)(const struct aml_layout_data *, size_t *dims);
+	int (*adims_column)(const struct aml_layout_data *, size_t *dims);
+        size_t (*ndims)(const struct aml_layout_data *);
+        size_t (*element_size)(const struct aml_layout_data *);
+        struct aml_layout * (*reshape)(const struct aml_layout_data *,
+				       size_t ndims, va_list dims);
+        struct aml_layout * (*areshape)(const struct aml_layout_data *,
+					size_t ndims, const size_t *dims);
+        struct aml_layout * (*slice)(const struct aml_layout_data *,
+				     va_list dims);
+	struct aml_layout * (*aslice)(const struct aml_layout_data *,
+				      const size_t *offsets, const size_t *dims,
+				      const size_t *strides);
+	struct aml_layout * (*aslice_column)(const struct aml_layout_data *,
+					     const size_t *offsets,
+					     const size_t *dims,
+					     const size_t *strides);
+};
+
+struct aml_layout {
+	uint64_t tags;
+	struct aml_layout_ops *ops;
+	struct aml_layout_data *data;
+};
+
+void *aml_layout_deref(const struct aml_layout *l, ...);
+void *aml_layout_aderef(const struct aml_layout *l, const size_t *coords);
+void *aml_layout_aderef_column(const struct aml_layout *l,
+			       const size_t *coords);
+int aml_layout_order(const struct aml_layout *l);
+int aml_layout_dims(const struct aml_layout *l, ...);
+int aml_layout_adims(const struct aml_layout *l, size_t *dims);
+int aml_layout_adims_column(const struct aml_layout *l, size_t *dims);
+size_t aml_layout_ndims(const struct aml_layout *l);
+size_t aml_layout_element_size(const struct aml_layout *l);
+struct aml_layout * aml_layout_areshape(const struct aml_layout *l,
+					size_t ndims, const size_t *dims);
+struct aml_layout * aml_layout_reshape(const struct aml_layout *l,
+				       size_t ndims, ...);
+struct aml_layout * aml_layout_slice(const struct aml_layout *l, ...);
+struct aml_layout * aml_layout_aslice(const struct aml_layout *l,
+				      const size_t *offsets, const size_t *dims,
+				      const size_t *strides);
+struct aml_layout * aml_layout_aslice_column(const struct aml_layout *l,
+					     const size_t *offsets,
+					     const size_t *dims,
+					     const size_t *strides);
+
+#endif
diff --git a/src/aml-scratch-double.h b/src/aml-scratch-double.h
new file mode 100644
index 00000000..791dcecf
--- /dev/null
+++ b/src/aml-scratch-double.h
@@ -0,0 +1,56 @@
+#ifndef AML_SCRATCH_DOUBLE_H
+#define AML_SCRATCH_DOUBLE_H 1
+
+/*******************************************************************************
+ * Sequential scratchpad API:
+ * Scratchpad uses calling thread to trigger asynchronous dma movements.
+ ******************************************************************************/
+
+extern struct aml_scratch_ops aml_scratch_double_ops;
+
+struct aml_scratch_request_double {
+	int type;
+	struct aml_dma *dma;
+	struct aml_layout *src;
+	int srcid;
+	struct aml_layout *dest;
+	int dstid;
+	pthread_t thread;
+};
+
+struct aml_scratch_double_data {
+	struct aml_tiling_nd *src_tiling;
+	struct aml_tiling_nd *dest_tiling;
+	struct aml_dma *push_dma;
+	struct aml_dma *pull_dma;
+	struct aml_vector tilemap;
+	struct aml_vector requests;
+	pthread_mutex_t lock;
+};
+
+struct aml_scratch_double_ops {
+	void *(*do_thread)(void *);
+};
+
+struct aml_scratch_double {
+	struct aml_scratch_double_ops ops;
+	struct aml_scratch_double_data data;
+};
+
+#define AML_SCRATCH_DOUBLE_DECL(name) \
+	struct aml_scratch_double __ ##name## _inner_data; \
+	struct aml_scratch name = { \
+		&aml_scratch_double_ops, \
+		(struct aml_scratch_data *)&__ ## name ## _inner_data, \
+	};
+
+#define AML_SCRATCH_DOUBLE_ALLOCSIZE \
+	(sizeof(struct aml_scratch_double) + \
+	 sizeof(struct aml_scratch))
+
+int aml_scratch_double_create(struct aml_scratch **scratch, ...);
+int aml_scratch_double_init(struct aml_scratch *scratch, ...);
+int aml_scratch_double_vinit(struct aml_scratch *scratch, va_list args);
+int aml_scratch_double_destroy(struct aml_scratch *scratch);
+
+#endif
diff --git a/src/aml-tiling-collapse.h b/src/aml-tiling-collapse.h
new file mode 100644
index 00000000..348ea0b6
--- /dev/null
+++ b/src/aml-tiling-collapse.h
@@ -0,0 +1,40 @@
+#ifndef AML_TILING_COLLAPSE_H
+#define AML_TILING_COLLAPSE_H
+
+#include <stdarg.h>
+
+struct aml_tiling_nd_data_collapse {
+	const struct aml_layout *l;
+	size_t ndims;
+	size_t *tile_dims;
+	size_t *dims;
+	size_t *border_tile_dims;
+};
+
+#define AML_TILING_COLLAPSE_ALLOCSIZE(ndims) (sizeof(struct aml_tiling_nd) +\
+					    sizeof(struct aml_tiling_nd_data_collapse) +\
+					    (ndims * 3) * sizeof(size_t))
+
+int aml_tiling_nd_collapse_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *data);
+int aml_tiling_nd_collapse_ainit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       const size_t *tile_dims);
+int aml_tiling_nd_collapse_vinit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       va_list data);
+int aml_tiling_nd_collapse_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...);
+int aml_tiling_nd_collapse_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims);
+int aml_tiling_nd_collapse_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data);
+int aml_tiling_nd_collapse_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...);
+
+extern struct aml_tiling_nd_ops aml_tiling_nd_collapse_column_ops;
+extern struct aml_tiling_nd_ops aml_tiling_nd_collapse_row_ops;
+
+#endif
diff --git a/src/aml-tiling-pad.h b/src/aml-tiling-pad.h
new file mode 100644
index 00000000..46f23d86
--- /dev/null
+++ b/src/aml-tiling-pad.h
@@ -0,0 +1,44 @@
+#ifndef AML_TILING_PAD_H
+#define AML_TILING_PAD_H
+
+#include <stdarg.h>
+
+struct aml_tiling_nd_data_pad {
+	const struct aml_layout *l;
+	size_t ndims;
+	size_t *tile_dims;
+	size_t *dims;
+	size_t *border_tile_dims;
+	size_t *pad;
+	void *neutral;
+};
+
+#define AML_TILING_PAD_ALLOCSIZE(ndims, neutral_size) ( \
+	sizeof(struct aml_tiling_nd) + \
+	sizeof(struct aml_tiling_nd_data_pad) + \
+	(ndims * 4) * sizeof(size_t) + \
+	neutral_size )
+
+int aml_tiling_nd_pad_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *data);
+int aml_tiling_nd_pad_ainit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       const size_t *tile_dims, void *neutral);
+int aml_tiling_nd_pad_vinit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       va_list data);
+int aml_tiling_nd_pad_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...);
+int aml_tiling_nd_pad_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims, void *neutral);
+int aml_tiling_nd_pad_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data);
+int aml_tiling_nd_pad_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...);
+
+extern struct aml_tiling_nd_ops aml_tiling_nd_pad_column_ops;
+extern struct aml_tiling_nd_ops aml_tiling_nd_pad_row_ops;
+
+#endif
diff --git a/src/aml-tiling-resize.h b/src/aml-tiling-resize.h
new file mode 100644
index 00000000..eb7fee8e
--- /dev/null
+++ b/src/aml-tiling-resize.h
@@ -0,0 +1,40 @@
+#ifndef AML_TILING_RESIZE_H
+#define AML_TILING_RESIZE_H
+
+#include <stdarg.h>
+
+struct aml_tiling_nd_data_resize {
+	const struct aml_layout *l;
+	size_t ndims;
+	size_t *tile_dims;
+	size_t *dims;
+	size_t *border_tile_dims;
+};
+
+#define AML_TILING_RESIZE_ALLOCSIZE(ndims) (sizeof(struct aml_tiling_nd) +\
+					    sizeof(struct aml_tiling_nd_data_resize) +\
+					    (ndims * 3) * sizeof(size_t))
+
+int aml_tiling_nd_resize_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *data);
+int aml_tiling_nd_resize_ainit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       const size_t *tile_dims);
+int aml_tiling_nd_resize_vinit(struct aml_tiling_nd *t, uint64_t tags,
+			       const struct aml_layout *l, size_t ndims,
+			       va_list data);
+int aml_tiling_nd_resize_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...);
+int aml_tiling_nd_resize_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims);
+int aml_tiling_nd_resize_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data);
+int aml_tiling_nd_resize_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...);
+
+extern struct aml_tiling_nd_ops aml_tiling_nd_resize_column_ops;
+extern struct aml_tiling_nd_ops aml_tiling_nd_resize_row_ops;
+
+#endif
diff --git a/src/aml-tiling.h b/src/aml-tiling.h
new file mode 100644
index 00000000..7bf2293d
--- /dev/null
+++ b/src/aml-tiling.h
@@ -0,0 +1,44 @@
+#ifndef AML_TILING_H
+#define AML_TILING_H 1
+
+#include <stdarg.h>
+
+struct aml_tiling_nd;
+struct aml_tiling_nd_data;
+
+#define AML_TYPE_TILING_ORDER (1 << 0)
+#define AML_TYPE_TILING_MAX (1 << 1)
+
+#define AML_TYPE_TILING_ROW_ORDER 1
+#define AML_TYPE_TILING_COLUMN_ORDER 0
+
+struct aml_tiling_nd_ops {
+	struct aml_layout* (*index)(const struct aml_tiling_nd_data *,
+				    va_list coords);
+	struct aml_layout* (*aindex)(const struct aml_tiling_nd_data *,
+				     const size_t *coords);
+	int (*order)(const struct aml_tiling_nd_data *);
+        int (*tile_dims)(const struct aml_tiling_nd_data *, va_list dim_ptrs);
+	int (*tile_adims)(const struct aml_tiling_nd_data *, size_t *dims);
+	int (*dims)(const struct aml_tiling_nd_data *, va_list dim_ptrs);
+	int (*adims)(const struct aml_tiling_nd_data *, size_t *dims);
+	size_t (*ndims)(const struct aml_tiling_nd_data *);
+};
+
+struct aml_tiling_nd {
+	uint64_t tags;
+	struct aml_tiling_nd_ops *ops;
+	struct aml_tiling_nd_data *data;
+};
+
+struct aml_layout *aml_tiling_nd_index(const struct aml_tiling_nd *t, ...);
+struct aml_layout *aml_tiling_nd_aindex(const struct aml_tiling_nd *t,
+					const size_t *coords);
+int aml_tiling_nd_order(const struct aml_tiling_nd *t);
+int aml_tiling_nd_tile_dims(const struct aml_tiling_nd *t, ...);
+int aml_tiling_nd_tile_adims(const struct aml_tiling_nd *t, size_t *dims);
+int aml_tiling_nd_dims(const struct aml_tiling_nd *t, ...);
+int aml_tiling_nd_adims(const struct aml_tiling_nd *t, size_t *dims);
+size_t aml_tiling_nd_ndims(const struct aml_tiling_nd *t);
+
+#endif
diff --git a/src/aml.h b/src/aml.h
index 73f23992..8b976419 100644
--- a/src/aml.h
+++ b/src/aml.h
@@ -1,6 +1,7 @@
 #ifndef AML_H
 #define AML_H 1
 
+#include <assert.h>
 #include <inttypes.h>
 #include <numa.h>
 #include <numaif.h>
@@ -18,7 +19,15 @@
 #define PAGE_SIZE 4096
 #endif
 
-
+#include "aml-layout.h"
+#include "aml-layout-dense.h"
+#include "aml-layout-pad.h"
+#include "aml-layout-reshape.h"
+#include "aml-tiling.h"
+#include "aml-tiling-resize.h"
+#include "aml-tiling-pad.h"
+#include "aml-tiling-collapse.h"
+#include "aml-copy.h"
 /*******************************************************************************
  * Forward Declarations:
  ******************************************************************************/
@@ -1297,6 +1306,7 @@ int aml_dma_wait(struct aml_dma *dma, struct aml_dma_request *req);
  */
 int aml_dma_cancel(struct aml_dma *dma, struct aml_dma_request *req);
 
+#include "aml-dma-layout.h"
 /*******************************************************************************
  * Linux Sequential DMA API:
  * DMA logic implemented based on general linux API, with the caller thread
@@ -1483,6 +1493,8 @@ struct aml_scratch_data;
 #define AML_SCRATCH_REQUEST_TYPE_PUSH 0
 /* Pull from regular memory to the scratchpad.  */
 #define AML_SCRATCH_REQUEST_TYPE_PULL 1
+/* No-op/empty request */
+#define AML_SCRATCH_REQUEST_TYPE_NOOP 2
 
 struct aml_scratch_ops {
 	int (*create_request)(struct aml_scratch_data *scratch,
@@ -1584,6 +1596,7 @@ void* aml_scratch_baseptr(const struct aml_scratch *scratch);
  */
 int aml_scratch_release(struct aml_scratch *scratch, int scratchid);
 
+#include "aml-scratch-double.h"
 /*******************************************************************************
  * Sequential scratchpad API:
  * Scratchpad uses calling thread to trigger asynchronous dma movements.
diff --git a/src/copy.c b/src/copy.c
new file mode 100644
index 00000000..0f3f37bf
--- /dev/null
+++ b/src/copy.c
@@ -0,0 +1,665 @@
+#include <aml.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <alloca.h>
+
+static inline void aml_compute_cumulative_pitch(size_t d,
+						size_t * cumul_dst_pitch,
+						size_t * cumul_src_pitch,
+						const size_t * dst_pitch,
+						const size_t * src_pitch,
+						size_t elem_size)
+{
+	cumul_dst_pitch[0] = elem_size;
+	cumul_src_pitch[0] = elem_size;
+	for (size_t i = 0; i < d - 1; i += 1) {
+		cumul_dst_pitch[i + 1] = dst_pitch[i] * cumul_dst_pitch[i];
+		cumul_src_pitch[i + 1] = src_pitch[i] * cumul_src_pitch[i];
+	}
+}
+
+static inline void aml_copy_nd_helper(size_t d, void *dst,
+				      const size_t * cumul_dst_pitch,
+				      const void *src,
+				      const size_t * cumul_src_pitch,
+				      const size_t * elem_number,
+				      size_t elem_size)
+{
+	if (d == 1)
+		if (cumul_dst_pitch[0] == elem_size
+		    && cumul_src_pitch[0] == elem_size)
+			memcpy(dst, src, elem_number[0] * elem_size);
+		else
+			for (size_t i = 0; i < elem_number[0]; i += 1)
+				memcpy((void *)((intptr_t) dst +
+						i * cumul_dst_pitch[0]),
+				       (void *)((intptr_t) src +
+						i * cumul_src_pitch[0]),
+				       elem_size);
+	else
+		for (size_t i = 0; i < elem_number[d - 1]; i += 1) {
+			aml_copy_nd_helper(d - 1, dst, cumul_dst_pitch, src,
+					   cumul_src_pitch, elem_number,
+					   elem_size);
+			dst = (void *)((intptr_t) dst + cumul_dst_pitch[d - 1]);
+			src = (void *)((intptr_t) src + cumul_src_pitch[d - 1]);
+		}
+}
+
+int aml_copy_nd_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		  const void *src, const size_t * cumul_src_pitch,
+		  const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	for (size_t i = 0; i < d - 1; i += 1) {
+		assert(cumul_dst_pitch[i + 1] >=
+		       cumul_dst_pitch[i] * elem_number[i]);
+		assert(cumul_src_pitch[i + 1] >=
+		       cumul_src_pitch[i] * elem_number[i]);
+	}
+	aml_copy_nd_helper(d, dst, cumul_dst_pitch, src, cumul_src_pitch,
+			   elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_nd(size_t d, void *dst, const size_t * dst_pitch, const void *src,
+		const size_t * src_pitch, const size_t * elem_number,
+		size_t elem_size)
+{
+	assert(d > 0);
+	size_t *cumul_dst_pitch;
+	size_t *cumul_src_pitch;
+	cumul_dst_pitch = (size_t *) alloca(d * sizeof(size_t));
+	cumul_src_pitch = (size_t *) alloca(d * sizeof(size_t));
+	aml_compute_cumulative_pitch(d, cumul_dst_pitch, cumul_src_pitch,
+				     dst_pitch, src_pitch, elem_size);
+	aml_copy_nd_c(d, dst, cumul_dst_pitch, src, cumul_src_pitch,
+		      elem_number, elem_size);
+	return 0;
+}
+
+static inline void aml_copy_ndstr_helper(size_t d, void *dst,
+					 const size_t * cumul_dst_pitch,
+					 const size_t * dst_stride,
+					 const void *src,
+					 const size_t * cumul_src_pitch,
+					 const size_t * src_stride,
+					 const size_t * elem_number,
+					 size_t elem_size)
+{
+	if (d == 1)
+		if (dst_stride[0] * cumul_dst_pitch[0] == elem_size
+		    && src_stride[0] * cumul_src_pitch[0] == elem_size)
+			memcpy(dst, src, elem_number[0] * elem_size);
+		else
+			for (size_t i = 0; i < elem_number[0]; i += 1)
+				memcpy((void *)((intptr_t) dst +
+						i * (dst_stride[0] *
+						     cumul_dst_pitch[0])),
+				       (void *)((intptr_t) src +
+						i * (src_stride[0] *
+						     cumul_src_pitch[0])),
+				       elem_size);
+	else
+		for (size_t i = 0; i < elem_number[d - 1]; i += 1) {
+			aml_copy_ndstr_helper(d - 1, dst, cumul_dst_pitch,
+					      dst_stride, src, cumul_src_pitch,
+					      src_stride, elem_number,
+					      elem_size);
+			dst =
+			    (void *)((intptr_t) dst +
+				     dst_stride[d - 1] * cumul_dst_pitch[d -
+									 1]);
+			src =
+			    (void *)((intptr_t) src +
+				     src_stride[d - 1] * cumul_src_pitch[d -
+									 1]);
+		}
+}
+
+int aml_copy_ndstr_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		     const size_t * dst_stride, const void *src,
+		     const size_t * cumul_src_pitch, const size_t * src_stride,
+		     const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	for (size_t i = 0; i < d - 1; i += 1) {
+		assert(cumul_dst_pitch[i + 1] >=
+		       dst_stride[i] * cumul_dst_pitch[i] * elem_number[i]);
+		assert(cumul_src_pitch[i + 1] >=
+		       src_stride[i] * cumul_src_pitch[i] * elem_number[i]);
+	}
+	aml_copy_ndstr_helper(d, dst, cumul_dst_pitch, dst_stride, src,
+			      cumul_src_pitch, src_stride, elem_number,
+			      elem_size);
+	return 0;
+}
+
+int aml_copy_ndstr(size_t d, void *dst, const size_t * dst_pitch,
+		   const size_t * dst_stride, const void *src,
+		   const size_t * src_pitch, const size_t * src_stride,
+		   const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *cumul_dst_pitch;
+	size_t *cumul_src_pitch;
+	cumul_dst_pitch = (size_t *) alloca(d * sizeof(size_t));
+	cumul_src_pitch = (size_t *) alloca(d * sizeof(size_t));
+	aml_compute_cumulative_pitch(d, cumul_dst_pitch, cumul_src_pitch,
+				     dst_pitch, src_pitch, elem_size);
+	aml_copy_ndstr_c(d, dst, cumul_dst_pitch, dst_stride, src,
+			 cumul_src_pitch, src_stride, elem_number, elem_size);
+	return 0;
+}
+
+static inline void aml_copy_shnd_helper(size_t d, const size_t * target_dims,
+					void *dst,
+					const size_t * cumul_dst_pitch,
+					const void *src,
+					const size_t * cumul_src_pitch,
+					const size_t * elem_number,
+					size_t elem_size)
+{
+	if (d == 1)
+		if (cumul_dst_pitch[0] == elem_size
+		    && cumul_src_pitch[target_dims[0]] == elem_size)
+			memcpy(dst, src,
+			       elem_number[target_dims[0]] * elem_size);
+		else
+			for (size_t i = 0; i < elem_number[target_dims[0]];
+			     i += 1)
+				memcpy((void *)((intptr_t) dst +
+						i * cumul_dst_pitch[0]),
+				       (void *)((intptr_t) src +
+						i *
+						cumul_src_pitch[target_dims
+								[0]]),
+				       elem_size);
+	else
+		for (size_t i = 0; i < elem_number[target_dims[d - 1]]; i += 1) {
+			aml_copy_shnd_helper(d - 1, target_dims, dst,
+					     cumul_dst_pitch, src,
+					     cumul_src_pitch, elem_number,
+					     elem_size);
+			dst = (void *)((intptr_t) dst + cumul_dst_pitch[d - 1]);
+			src =
+			    (void *)((intptr_t) src +
+				     cumul_src_pitch[target_dims[d - 1]]);
+		}
+}
+
+int aml_copy_shnd_c(size_t d, const size_t * target_dims, void *dst,
+		    const size_t * cumul_dst_pitch, const void *src,
+		    const size_t * cumul_src_pitch, const size_t * elem_number,
+		    size_t elem_size)
+{
+	assert(d > 0);
+	size_t present_dims;
+	present_dims = 0;
+	for (size_t i = 0; i < d; i += 1) {
+		assert(target_dims[i] < d);
+		present_dims |= 1 << target_dims[i];
+	}
+	for (size_t i = 0; i < d; i += 1)
+		assert(present_dims & 1 << i);
+	for (size_t i = 0; i < d - 1; i += 1) {
+		assert(cumul_dst_pitch[i + 1] >=
+		       cumul_dst_pitch[i] * elem_number[target_dims[i]]);
+		assert(cumul_src_pitch[i + 1] >=
+		       cumul_src_pitch[i] * elem_number[i]);
+	}
+	aml_copy_shnd_helper(d, target_dims, dst, cumul_dst_pitch, src,
+			     cumul_src_pitch, elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_shnd(size_t d, const size_t * target_dims, void *dst,
+		  const size_t * dst_pitch, const void *src,
+		  const size_t * src_pitch, const size_t * elem_number,
+		  size_t elem_size)
+{
+	assert(d > 0);
+	size_t *cumul_dst_pitch;
+	size_t *cumul_src_pitch;
+	cumul_dst_pitch = (size_t *) alloca(d * sizeof(size_t));
+	cumul_src_pitch = (size_t *) alloca(d * sizeof(size_t));
+	aml_compute_cumulative_pitch(d, cumul_dst_pitch, cumul_src_pitch,
+				     dst_pitch, src_pitch, elem_size);
+	aml_copy_shnd_c(d, target_dims, dst, cumul_dst_pitch, src,
+			cumul_src_pitch, elem_number, elem_size);
+	return 0;
+}
+
+static inline void aml_copy_shndstr_helper(size_t d, const size_t * target_dims,
+					   void *dst,
+					   const size_t * cumul_dst_pitch,
+					   const size_t * dst_stride,
+					   const void *src,
+					   const size_t * cumul_src_pitch,
+					   const size_t * src_stride,
+					   const size_t * elem_number,
+					   size_t elem_size)
+{
+	if (d == 1)
+		if (dst_stride[0] * cumul_dst_pitch[0] == elem_size
+		    && src_stride[target_dims[0]] *
+		    cumul_src_pitch[target_dims[0]] == elem_size)
+			memcpy(dst, src,
+			       elem_number[target_dims[0]] * elem_size);
+		else
+			for (size_t i = 0; i < elem_number[target_dims[0]];
+			     i += 1)
+				memcpy((void *)((intptr_t) dst +
+						i * (dst_stride[0] *
+						     cumul_dst_pitch[0])),
+				       (void *)((intptr_t) src +
+						i *
+						(src_stride[target_dims[0]] *
+						 cumul_src_pitch[target_dims
+								 [0]])),
+				       elem_size);
+	else
+		for (size_t i = 0; i < elem_number[target_dims[d - 1]]; i += 1) {
+			aml_copy_shndstr_helper(d - 1, target_dims, dst,
+						cumul_dst_pitch, dst_stride,
+						src, cumul_src_pitch,
+						src_stride, elem_number,
+						elem_size);
+			dst =
+			    (void *)((intptr_t) dst +
+				     dst_stride[d - 1] * cumul_dst_pitch[d -
+									 1]);
+			src =
+			    (void *)((intptr_t) src +
+				     src_stride[target_dims[d - 1]] *
+				     cumul_src_pitch[target_dims[d - 1]]);
+		}
+}
+
+int aml_copy_shndstr_c(size_t d, const size_t * target_dims, void *dst,
+		       const size_t * cumul_dst_pitch,
+		       const size_t * dst_stride, const void *src,
+		       const size_t * cumul_src_pitch,
+		       const size_t * src_stride, const size_t * elem_number,
+		       size_t elem_size)
+{
+	assert(d > 0);
+	size_t present_dims;
+	present_dims = 0;
+	for (size_t i = 0; i < d; i += 1) {
+		assert(target_dims[i] < d);
+		present_dims |= 1 << target_dims[i];
+	}
+	for (size_t i = 0; i < d; i += 1)
+		assert(present_dims & 1 << i);
+	for (size_t i = 0; i < d - 1; i += 1) {
+		assert(cumul_dst_pitch[i + 1] >=
+		       dst_stride[i] * cumul_dst_pitch[i] *
+		       elem_number[target_dims[i]]);
+		assert(cumul_src_pitch[i + 1] >=
+		       src_stride[i] * cumul_src_pitch[i] * elem_number[i]);
+	}
+	aml_copy_shndstr_helper(d, target_dims, dst, cumul_dst_pitch,
+				dst_stride, src, cumul_src_pitch, src_stride,
+				elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_shndstr(size_t d, const size_t * target_dims, void *dst,
+		     const size_t * dst_pitch, const size_t * dst_stride,
+		     const void *src, const size_t * src_pitch,
+		     const size_t * src_stride, const size_t * elem_number,
+		     size_t elem_size)
+{
+	assert(d > 0);
+	size_t *cumul_dst_pitch;
+	size_t *cumul_src_pitch;
+	cumul_dst_pitch = (size_t *) alloca(d * sizeof(size_t));
+	cumul_src_pitch = (size_t *) alloca(d * sizeof(size_t));
+	aml_compute_cumulative_pitch(d, cumul_dst_pitch, cumul_src_pitch,
+				     dst_pitch, src_pitch, elem_size);
+	aml_copy_shndstr_c(d, target_dims, dst, cumul_dst_pitch, dst_stride,
+			   src, cumul_src_pitch, src_stride, elem_number,
+			   elem_size);
+	return 0;
+}
+
+int aml_copy_tnd(size_t d, void *dst, const size_t * dst_pitch, const void *src,
+		 const size_t * src_pitch, const size_t * elem_number,
+		 size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	aml_copy_shnd(d, target_dims, dst, dst_pitch, src, src_pitch,
+		      elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_tnd_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		   const void *src, const size_t * cumul_src_pitch,
+		   const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	aml_copy_shnd_c(d, target_dims, dst, cumul_dst_pitch, src,
+			cumul_src_pitch, elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_rtnd(size_t d, void *dst, const size_t * dst_pitch,
+		  const void *src, const size_t * src_pitch,
+		  const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	aml_copy_shnd(d, target_dims, dst, dst_pitch, src, src_pitch,
+		      elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_rtnd_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		    const void *src, const size_t * cumul_src_pitch,
+		    const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	aml_copy_shnd_c(d, target_dims, dst, cumul_dst_pitch, src,
+			cumul_src_pitch, elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_tndstr(size_t d, void *dst, const size_t * dst_pitch,
+		    const size_t * dst_stride, const void *src,
+		    const size_t * src_pitch, const size_t * src_stride,
+		    const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	aml_copy_shndstr(d, target_dims, dst, dst_pitch, dst_stride, src,
+			 src_pitch, src_stride, elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_tndstr_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		      const size_t * dst_stride, const void *src,
+		      const size_t * cumul_src_pitch, const size_t * src_stride,
+		      const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	aml_copy_shndstr_c(d, target_dims, dst, cumul_dst_pitch, dst_stride,
+			   src, cumul_src_pitch, src_stride, elem_number,
+			   elem_size);
+	return 0;
+}
+
+int aml_copy_rtndstr(size_t d, void *dst, const size_t * dst_pitch,
+		     const size_t * dst_stride, const void *src,
+		     const size_t * src_pitch, const size_t * src_stride,
+		     const size_t * elem_number, size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	aml_copy_shndstr(d, target_dims, dst, dst_pitch, dst_stride, src,
+			 src_pitch, src_stride, elem_number, elem_size);
+	return 0;
+}
+
+int aml_copy_rtndstr_c(size_t d, void *dst, const size_t * cumul_dst_pitch,
+		       const size_t * dst_stride, const void *src,
+		       const size_t * cumul_src_pitch,
+		       const size_t * src_stride, const size_t * elem_number,
+		       size_t elem_size)
+{
+	assert(d > 0);
+	size_t *target_dims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	aml_copy_shndstr_c(d, target_dims, dst, cumul_dst_pitch, dst_stride,
+			   src, cumul_src_pitch, src_stride, elem_number,
+			   elem_size);
+	return 0;
+}
+
+int aml_copy_layout_native(struct aml_layout *dst, const struct aml_layout *src)
+{
+	size_t d;
+	size_t elem_size;
+	struct aml_layout_data_native *ddst;
+	struct aml_layout_data_native *dsrc;
+	ddst = (struct aml_layout_data_native *)dst->data;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	assert(d > 0);
+	elem_size = dsrc->cpitch[0];
+	assert(d == ddst->ndims);
+	assert(elem_size == ddst->cpitch[0]);
+	for (size_t i = 0; i < d; i += 1)
+		assert(dsrc->dims[i] == ddst->dims[i]);
+	return aml_copy_ndstr_c(d, ddst->ptr, ddst->cpitch, ddst->stride,
+				dsrc->ptr, dsrc->cpitch, dsrc->stride,
+				dsrc->dims, elem_size);
+}
+
+int aml_copy_layout_transform_native(struct aml_layout *dst,
+				     const struct aml_layout *src,
+				     const size_t * target_dims)
+{
+	size_t d;
+	size_t elem_size;
+	struct aml_layout_data_native *ddst;
+	struct aml_layout_data_native *dsrc;
+	ddst = (struct aml_layout_data_native *)dst->data;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	assert(d > 0);
+	elem_size = dsrc->cpitch[0];
+	assert(d == ddst->ndims);
+	assert(elem_size == ddst->cpitch[0]);
+	for (size_t i = 0; i < d; i += 1)
+		assert(dsrc->dims[target_dims[i]] == ddst->dims[i]);
+	return aml_copy_shndstr_c(d, target_dims, ddst->ptr, ddst->cpitch,
+				  ddst->stride, dsrc->ptr, dsrc->cpitch,
+				  dsrc->stride, dsrc->dims, elem_size);
+}
+
+int aml_copy_layout_transpose_native(struct aml_layout *dst,
+				     const struct aml_layout *src)
+{
+	size_t d;
+	size_t *target_dims;
+	struct aml_layout_data_native *dsrc;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	return aml_copy_layout_transform_native(dst, src, target_dims);
+}
+
+int aml_copy_layout_reverse_transpose_native(struct aml_layout *dst,
+					     const struct aml_layout *src)
+{
+	size_t d;
+	size_t *target_dims;
+	struct aml_layout_data_native *dsrc;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	return aml_copy_layout_transform_native(dst, src, target_dims);
+}
+
+static inline void aml_copy_layout_generic_helper(size_t d,
+						  struct aml_layout *dst,
+						  const struct aml_layout *src,
+						  const size_t * elem_number,
+						  size_t elem_size,
+						  size_t * coords)
+{
+	if (d == 1)
+		for (size_t i = 0; i < elem_number[0]; i += 1) {
+			coords[0] = i;
+			coords[0] = i;
+			memcpy(aml_layout_aderef_column(dst, coords),
+			       aml_layout_aderef_column(src, coords),
+			       elem_size);
+	} else
+		for (size_t i = 0; i < elem_number[d - 1]; i += 1) {
+			coords[d - 1] = i;
+			coords[d - 1] = i;
+			aml_copy_layout_generic_helper(d - 1, dst, src,
+						       elem_number, elem_size,
+						       coords);
+		}
+}
+
+static inline void aml_copy_layout_transform_generic_helper(size_t d,
+							    struct aml_layout
+							    *dst,
+							    const struct
+							    aml_layout *src,
+							    const size_t *
+							    elem_number,
+							    size_t elem_size,
+							    size_t * coords,
+							    size_t * coords_out,
+							    const size_t *
+							    target_dims)
+{
+	if (d == 1)
+		for (size_t i = 0; i < elem_number[target_dims[0]]; i += 1) {
+			coords_out[0] = i;
+			coords[target_dims[0]] = i;
+			memcpy(aml_layout_aderef_column(dst, coords_out),
+			       aml_layout_aderef_column(src, coords),
+			       elem_size);
+	} else
+		for (size_t i = 0; i < elem_number[target_dims[d - 1]]; i += 1) {
+			coords_out[d - 1] = i;
+			coords[target_dims[d - 1]] = i;
+			aml_copy_layout_transform_generic_helper(d - 1, dst,
+								 src,
+								 elem_number,
+								 elem_size,
+								 coords,
+								 coords_out,
+								 target_dims);
+		}
+}
+
+int aml_copy_layout_generic(struct aml_layout *dst,
+			    const struct aml_layout *src)
+{
+	size_t d;
+	size_t elem_size;
+	size_t *coords;
+	size_t *elem_number;
+	size_t *elem_number2;
+	assert(aml_layout_ndims(dst) == aml_layout_ndims(src));
+	d = aml_layout_ndims(dst);
+	assert(aml_layout_element_size(dst) == aml_layout_element_size(src));
+	elem_size = aml_layout_element_size(dst);
+	coords = (size_t *) alloca(d * sizeof(size_t));
+	elem_number = (size_t *) alloca(d * sizeof(size_t));
+	elem_number2 = (size_t *) alloca(d * sizeof(size_t));
+	aml_layout_adims_column(src, elem_number);
+	aml_layout_adims_column(dst, elem_number2);
+	for (size_t i = 0; i < d; i += 1)
+		assert(elem_number[i] == elem_number2[i]);
+	aml_copy_layout_generic_helper(d, dst, src, elem_number, elem_size,
+				       coords);
+	return 0;
+}
+
+int aml_copy_layout_transform_generic(struct aml_layout *dst,
+				      const struct aml_layout *src,
+				      const size_t * target_dims)
+{
+	size_t d;
+	size_t elem_size;
+	size_t *coords;
+	size_t *coords_out;
+	size_t *elem_number;
+	size_t *elem_number2;
+	assert(aml_layout_ndims(dst) == aml_layout_ndims(src));
+	d = aml_layout_ndims(dst);
+	assert(aml_layout_element_size(dst) == aml_layout_element_size(src));
+	elem_size = aml_layout_element_size(dst);
+	coords = (size_t *) alloca(d * sizeof(size_t));
+	coords_out = (size_t *) alloca(d * sizeof(size_t));
+	elem_number = (size_t *) alloca(d * sizeof(size_t));
+	elem_number2 = (size_t *) alloca(d * sizeof(size_t));
+	aml_layout_adims_column(src, elem_number);
+	aml_layout_adims_column(dst, elem_number2);
+	for (size_t i = 0; i < d; i += 1)
+		assert(elem_number[target_dims[i]] == elem_number2[i]);
+	aml_copy_layout_transform_generic_helper(d, dst, src, elem_number,
+						 elem_size, coords, coords_out,
+						 target_dims);
+	return 0;
+}
+
+int aml_copy_layout_transpose_generic(struct aml_layout *dst,
+				      const struct aml_layout *src)
+{
+	size_t d;
+	size_t *target_dims;
+	struct aml_layout_data_native *dsrc;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[d - 1] = 0;
+	for (size_t i = 0; i < d - 1; i += 1)
+		target_dims[i] = i + 1;
+	return aml_copy_layout_transform_generic(dst, src, target_dims);
+}
+
+int aml_copy_layout_reverse_transpose_generic(struct aml_layout *dst,
+					      const struct aml_layout *src)
+{
+	size_t d;
+	size_t *target_dims;
+	struct aml_layout_data_native *dsrc;
+	dsrc = (struct aml_layout_data_native *)src->data;
+	d = dsrc->ndims;
+	target_dims = (size_t *) alloca(d * sizeof(size_t));
+	target_dims[0] = d - 1;
+	for (size_t i = 1; i < d; i += 1)
+		target_dims[i] = i - 1;
+	return aml_copy_layout_transform_generic(dst, src, target_dims);
+}
diff --git a/src/copy.rb b/src/copy.rb
new file mode 100644
index 00000000..e276ed54
--- /dev/null
+++ b/src/copy.rb
@@ -0,0 +1,576 @@
+stdin1, stdout0 = IO.pipe
+stdin2, stdout1 = IO.pipe
+
+pid1 = Process.fork {
+  stdout0.close
+  stdin2.close
+  require 'cast'
+
+  parser = C::Parser::new
+  parser.type_names << '__builtin_va_list'
+  cpp = C::Preprocessor::new
+  cpp.macros['__attribute__(a)'] = ''
+  cpp.macros['__restrict'] = 'restrict'
+  cpp.macros['__extension__'] = ''
+  cpp.macros['__asm__(a)'] = ''
+  cpp.include_path << './'
+
+
+
+  preprocessed_sources = cpp.preprocess(<<EOF).gsub(/^#.*?$/, '')
+#include <stddef.h>
+#include <stdint.h>
+#include <aml-layout.h>
+#include <aml-layout-dense.h>
+#include <string.h>
+#include <alloca.h>
+EOF
+
+  parser.parse(preprocessed_sources)
+
+  ast = parser.parse(stdin1.read)
+  stdin1.close
+
+  ast.postorder { |n|
+    n.stmt = n.stmt.stmts.first if n.For? && n.stmt.Block? && n.stmt.stmts.size == 1
+    n.then = n.then.stmts.first if n.If? && n.then.Block? && n.then.stmts.size == 1
+    n.else = n.else.stmts.first if n.If? && n.else && n.else.Block? && n.else.stmts.size == 1
+  }
+
+  stdout1.puts <<EOF
+#include <aml.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <alloca.h>
+
+EOF
+  stdout1.puts ast
+  stdout1.close
+}
+
+pid2 = Process.fork {
+  stdin1.close
+  stdout0.close
+  stdout1.close
+  require 'open3'
+  Open3.popen3('indent -nbad -bap -nbc -bbo -hnl -br -brs -c33 -cd33 -ncdb -ce -ci4 -cli0 -d0 -di1 -nfc1 -i8 -ip0 -l80 -lp -npcs -nprs -npsl -sai -saf -saw -ncs -nsc -sob -nfca -cp33 -ss -ts8 -il1') do |i, o, t|
+    i.write stdin2.read
+    stdin2.close
+    i.close
+    puts o.read
+  end
+}
+
+stdin1.close
+stdout1.close
+stdin2.close
+
+require 'BOAST'
+include BOAST
+
+set_array_start(0)
+set_lang(C)
+set_default_int_size(nil)
+set_output(stdout0)
+
+register_funccall( :alloca )
+register_funccall( :memcpy )
+register_funccall( :assert )
+register_funccall( :sizeof )
+
+def name_prefix
+  "aml_copy_"
+end
+
+def name(suffix = nil, stride: false, shuffle: false)
+  name = name_prefix
+  name << "sh" if shuffle
+  name << "nd"
+  name << "str" if stride
+  name << "_#{suffix}" if suffix
+  name
+end
+
+def transpose_name(reverse: false, stride: false, cumulative: false)
+  name = name_prefix
+  name << "r" if reverse
+  name << "tnd"
+  name << "str" if stride
+  name << "_c" if cumulative
+  name
+end
+
+def aml_compute_cumulative_pitch
+  d = Sizet :d
+  cumul_dst_pitch = Sizet :cumul_dst_pitch, dim: Dim(d), dir: :out
+  cumul_src_pitch = Sizet :cumul_src_pitch, dim: Dim(d), dir: :out
+  dst_pitch = Sizet :dst_pitch, dim: Dim(d), dir: :in
+  src_pitch = Sizet :src_pitch, dim: Dim(d), dir: :in
+  elem_size = Sizet :elem_size
+  i = Sizet :i
+  p = Procedure( :aml_compute_cumulative_pitch,
+                 [ d,
+                   cumul_dst_pitch, cumul_src_pitch,
+                   dst_pitch, src_pitch,
+                   elem_size ],
+                 local: true,
+                 inline: true ) {
+    pr cumul_dst_pitch[0] === elem_size;
+    pr cumul_src_pitch[0] === elem_size;
+    
+    pr For(i, 0, d - 1, operator: '<', declit: true) {
+      pr cumul_dst_pitch[i + 1] === dst_pitch[i] * cumul_dst_pitch[i]
+      pr cumul_src_pitch[i + 1] === src_pitch[i] * cumul_src_pitch[i]
+    } 
+  }
+end
+
+def aml_copy_nd_helper(stride: false, shuffle: false)
+  d = Sizet :d
+  target_dims = Sizet :target_dims, dim: Dim(), dir: :in
+  dst = Pointer :dst, dir: :out
+  cumul_dst_pitch = Sizet :cumul_dst_pitch, dim: Dim(), dir: :in
+  dst_stride = Sizet :dst_stride, dim: Dim(), dir: :in
+  src = Pointer :src, dir: :in
+  cumul_src_pitch = Sizet :cumul_src_pitch, dim: Dim(), dir: :in
+  src_stride = Sizet :src_stride, dim: Dim(), dir: :in
+  elem_number = Sizet :elem_number, dim: Dim(), dir: :in
+  elem_size = Sizet :elem_size
+  i = Sizet :i
+
+  args = []
+  args += [ d ]
+  args += [ target_dims ] if shuffle
+  args += [ dst, cumul_dst_pitch ]
+  args += [ dst_stride ] if stride
+  args += [ src, cumul_src_pitch ]
+  args += [ src_stride ] if stride
+  args += [ elem_number, elem_size ]
+
+  effective_dst_pitch = lambda { |d| cumul_dst_pitch[d] }
+  effective_src_pitch = lambda { |d| cumul_src_pitch[d] }
+  if stride
+    tmp_dst = effective_dst_pitch
+    effective_dst_pitch = lambda { |d| dst_stride[d] * tmp_dst[d] }
+    tmp_src = effective_src_pitch
+    effective_src_pitch = lambda { |d| src_stride[d] * tmp_src[d] }
+  end
+
+  src_index = lambda { |d| d }
+  dst_index = lambda { |d| d }
+  elem_index = lambda { |d| d }
+  if shuffle
+    elem_index = lambda { |d| target_dims[d] }
+    src_index = lambda { |d| target_dims[d] }
+  end
+
+  name = name(:helper, stride: stride, shuffle: shuffle)
+
+  p = Procedure( name,
+                 args,
+                 local: true,
+                 inline: true ) {
+    pr If( d == 1 => lambda {
+      pr If( And(effective_dst_pitch[dst_index[0]] == elem_size,
+                 effective_src_pitch[src_index[0]] == elem_size) => lambda {
+        pr memcpy(dst, src, elem_number[elem_index[0]] * elem_size) 
+      }, else: lambda {
+        pr For( i, 0, elem_number[elem_index[0]], operator: '<', declit: true ) {
+          pr memcpy( (dst.cast(Intptrt) + i * effective_dst_pitch[dst_index[0]]).cast(dst),
+                     (src.cast(Intptrt) + i * effective_src_pitch[src_index[0]]).cast(src),
+                     elem_size)
+        }
+      })
+    }, else: lambda {
+      pr For( i, 0, elem_number[elem_index[d - 1]], operator: '<', declit: true ) {
+        args[0] = d - 1
+        pr p.call(*args)
+        pr dst === (dst.cast(Intptrt) + effective_dst_pitch[dst_index[d - 1]]).cast(dst)
+        pr src === (src.cast(Intptrt) + effective_src_pitch[src_index[d - 1]]).cast(src)
+      }
+    })
+ 
+  }
+end
+
+def aml_copy_nd_c(stride: false, shuffle: false)
+  d = Sizet :d
+  target_dims = Sizet :target_dims, dim: Dim(), dir: :in
+  dst = Pointer :dst, dir: :out
+  cumul_dst_pitch = Sizet :cumul_dst_pitch, dim: Dim(d), dir: :in
+  dst_stride = Sizet :dst_stride, dim: Dim(d), dir: :in
+  src = Pointer :src, dir: :in
+  cumul_src_pitch = Sizet :cumul_src_pitch, dim: Dim(d), dir: :in
+  src_stride = Sizet :src_stride, dim: Dim(d), dir: :in
+  elem_number = Sizet :elem_number, dim: Dim(d), dir: :in
+  elem_size = Sizet :elem_size
+  i = Sizet :i
+  present_dims = Sizet :present_dims
+
+  args = []
+  args += [ d ]
+  args += [ target_dims ] if shuffle
+  args += [ dst, cumul_dst_pitch ]
+  args += [ dst_stride ] if stride
+  args += [ src, cumul_src_pitch]
+  args += [ src_stride ] if stride
+  args += [ elem_number, elem_size]
+
+  effective_dst_pitch = lambda { |d| cumul_dst_pitch[d] }
+  effective_src_pitch = lambda { |d| cumul_src_pitch[d] }
+  if stride
+    tmp_dst = effective_dst_pitch
+    effective_dst_pitch = lambda { |d| dst_stride[d] * tmp_dst[d] }
+    tmp_src = effective_src_pitch
+    effective_src_pitch = lambda { |d| src_stride[d] * tmp_src[d] }
+  end
+
+  elem_index = lambda { |d| d }
+  if shuffle
+    elem_index = lambda { |d| target_dims[d] }
+  end
+
+  name = name(:c, stride: stride, shuffle: shuffle)
+
+  p = Procedure( name,
+                 args,
+                 return_type: Int ) {
+    pr assert(d > 0)
+    if shuffle
+      decl present_dims
+      pr present_dims === 0
+      pr For(i, 0, d, operator: '<', declit: true ) {
+        pr assert(target_dims[i] < d)
+        get_output.puts "#{present_dims} |= 1 << #{target_dims[i]};"
+      }
+      pr For(i, 0, d, operator: '<', declit: true ) {
+        pr assert("#{present_dims} & (1 << #{i})")
+      }
+    end
+    pr For(i, 0, d - 1, operator: '<', declit: true ) {
+      pr assert(cumul_dst_pitch[i + 1] >= effective_dst_pitch[i] * elem_number[elem_index[i]]);
+      pr assert(cumul_src_pitch[i + 1] >= effective_src_pitch[i] * elem_number[i]);
+    }
+    pr aml_copy_nd_helper(stride: stride, shuffle: shuffle).call( *args )
+    pr Return(0)
+  }
+end
+
+def aml_copy_nd(stride: false, shuffle: false)
+  d = Sizet :d
+  target_dims = Sizet :target_dims, dim: Dim(d), dir: :in
+  dst = Pointer :dst, dir: :out
+  dst_pitch = Sizet :dst_pitch, dim: Dim(d), dir: :in
+  dst_stride = Sizet :dst_stride, dim: Dim(d), dir: :in
+  src = Pointer :src, dir: :in
+  src_pitch = Sizet :src_pitch, dim: Dim(d), dir: :in
+  src_stride = Sizet :src_stride, dim: Dim(d), dir: :in
+  elem_number = Sizet :elem_number, dim: Dim(d), dir: :in
+  elem_size = Sizet :elem_size
+  cumul_dst_pitch = Pointer :cumul_dst_pitch, type: Sizet
+  cumul_src_pitch = Pointer :cumul_src_pitch, type: Sizet
+
+  args = []
+  args += [ d ]
+  args += [ target_dims ] if shuffle
+  args += [ dst, dst_pitch ]
+  args += [ dst_stride ] if stride
+  args += [ src, src_pitch]
+  args += [ src_stride ] if stride
+  args += [ elem_number, elem_size]
+
+  name = name(stride: stride, shuffle: shuffle)
+
+  p = Procedure( name,
+                 args,
+                 return_type: Int ) {
+    pr assert(d > 0);
+    decl cumul_dst_pitch, cumul_src_pitch
+    pr cumul_dst_pitch === alloca(d * sizeof("size_t")).cast(cumul_dst_pitch)
+    pr cumul_src_pitch === alloca(d * sizeof("size_t")).cast(cumul_src_pitch)
+    pr $aml_compute_cumulative_pitch.call(d, cumul_dst_pitch, cumul_src_pitch,
+                                          dst_pitch, src_pitch, elem_size);
+    args = []
+    args += [ d ]
+    args += [ target_dims ] if shuffle
+    args += [ dst, cumul_dst_pitch ]
+    args += [ dst_stride ] if stride
+    args += [ src, cumul_src_pitch]
+    args += [ src_stride ] if stride
+    args +=  [ elem_number, elem_size]
+
+    pr aml_copy_nd_c(stride: stride, shuffle: shuffle).call( *args )
+    pr Return(0)
+  }
+end
+
+def aml_copy_tnd(reverse: false, stride: false, cumulative: false)
+  d = Sizet :d
+  dst = Pointer :dst, dir: :out
+  dst_pitch = Sizet :dst_pitch, dim: Dim(d), dir: :in
+  cumul_dst_pitch = Sizet :cumul_dst_pitch, dim: Dim(d), dir: :in
+  dst_stride = Sizet :dst_stride, dim: Dim(d), dir: :in
+  src = Pointer :src, dir: :in
+  src_pitch = Sizet :src_pitch, dim: Dim(d), dir: :in
+  src_stride = Sizet :src_stride, dim: Dim(d), dir: :in
+  cumul_src_pitch = Sizet :cumul_src_pitch, dim: Dim(d), dir: :in
+  elem_number = Sizet :elem_number, dim: Dim(d), dir: :in
+  elem_size = Sizet :elem_size
+
+  args = []
+  args += [ d, dst ]
+  args += cumulative ? [ cumul_dst_pitch ] : [ dst_pitch ] 
+  args += [ dst_stride ] if stride
+  args += [ src ]
+  args += cumulative ? [ cumul_src_pitch ] : [ src_pitch ]
+  args += [ src_stride ] if stride
+  args += [ elem_number, elem_size]
+
+  target_dims = Sizet :target_dims, dim: Dim(d)
+  i = Sizet :i
+
+  name = transpose_name(reverse: reverse, stride: stride, cumulative: cumulative)
+
+  p = Procedure( name,
+                 args,
+                 return_type: Int ) {
+    pr assert(d > 0);
+    decl target_dims
+    pr target_dims === alloca(d * sizeof("size_t")).cast(target_dims)
+    if reverse
+      pr target_dims[0] === d - 1
+      pr For(i, 1, d, operator: '<', declit: true) {
+        pr target_dims[i] === i - 1
+      }
+    else
+      pr target_dims[d - 1] === 0
+      pr For(i, 0, d - 1, operator: '<', declit: true) {
+        pr target_dims[i] === i + 1
+      }
+    end
+
+    args.insert(1, target_dims)
+
+    if cumulative
+      pr aml_copy_nd_c(stride: stride, shuffle: true).call(*args)
+    else
+      pr aml_copy_nd(stride: stride, shuffle: true).call(*args)
+    end
+    pr Return(0)
+  }
+end
+
+def aml_copy_layout_generic_helper(shuffle: false)
+  d = Sizet :d
+  dst = Pointer :dst, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :inout
+  src = Pointer :src, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :in
+  elem_number = Sizet :elem_number, dim: Dim(), dir: :in
+  elem_size = Sizet :elem_size
+  coords = Sizet :coords, dim: Dim(), dir: :inout
+  coords_out = Sizet :coords_out, dim: Dim(), dir: :inout
+  target_dims = Sizet :target_dims, dim: Dim(), dir: :in
+
+  i = Sizet :i
+
+  name = name_prefix + "layout_"
+  name << "transform_" if shuffle
+  name << "generic_helper"
+
+  args = [d, dst, src, elem_number, elem_size, coords]
+  args << coords_out << target_dims if shuffle
+
+  src_index = lambda { |d| d }
+  dst_index = lambda { |d| d }
+  elem_index = lambda { |d| d }
+  if shuffle
+    elem_index = lambda { |d| target_dims[d] }
+    src_index = lambda { |d| target_dims[d] }
+  end
+
+  coord_src = coords
+  coord_dst = coords
+  if shuffle
+    coord_dst = coords_out
+  end
+
+  p = Procedure( name, args, local: true, inline: true ) {
+    pr If( d == 1 => lambda {
+      pr For( i, 0, elem_number[elem_index[0]], operator: '<', declit: true ) {
+        pr coord_dst[dst_index[0]] === i
+        pr coord_src[src_index[0]] === i
+        pr memcpy( FuncCall(:aml_layout_aderef_column, dst, coord_dst), FuncCall(:aml_layout_aderef_column, src, coord_src), elem_size )
+      }
+    }, else: lambda {
+      pr For( i, 0, elem_number[elem_index[d - 1]], operator: '<', declit: true ) {
+        args[0] = d - 1
+        pr coord_dst[dst_index[d - 1]] === i
+        pr coord_src[src_index[d - 1]] === i
+        pr p.call(*args)
+      }
+    })
+  }
+end
+
+def aml_copy_layout(native: true, shuffle: false)
+  dst = Pointer :dst, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :inout
+  src = Pointer :src, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :in
+  target_dims = Sizet :target_dims, dim: Dim(), dir: :in
+
+  ddst = Pointer :ddst, type: CStruct::new(type_name: :aml_layout_data_native, members: {})
+  dsrc = Pointer :dsrc, type: CStruct::new(type_name: :aml_layout_data_native, members: {})
+  d = Sizet :d
+  elem_size = Sizet :elem_size
+  i = Sizet :i
+
+  src_index = lambda { |d| d }
+  dst_index = lambda { |d| d }
+  if shuffle
+    src_index = lambda { |d| target_dims[d] }
+  end
+
+  name = name_prefix + "layout_"
+  name << "transform_" if shuffle
+  name << (native ? "native" : "generic")
+
+  args = [dst, src]
+  args << target_dims if shuffle
+
+  p = Procedure( name, args, return_type: Int ) {
+    decl d, elem_size
+
+    if native
+      decl ddst, dsrc
+
+      pr ddst === "(struct aml_layout_data_native *)#{dst}->data"
+      pr dsrc === "(struct aml_layout_data_native *)#{src}->data"
+      pr d === "#{dsrc}->ndims"
+      pr assert(d > 0);
+
+      pr elem_size === "#{dsrc}->cpitch[0]"
+      pr assert(d == "#{ddst}->ndims")
+      pr assert(elem_size == "#{ddst}->cpitch[0]")
+      pr For(i, 0, d, operator: '<', declit: true) {
+        pr assert( "#{dsrc}->dims[#{src_index[i]}] == #{ddst}->dims[#{dst_index[i]}]" )
+      }
+
+      args = []
+      args += [ d ]
+      args += [ target_dims ] if shuffle
+      args += [ "#{ddst}->ptr", "#{ddst}->cpitch", "#{ddst}->stride",
+                "#{dsrc}->ptr", "#{dsrc}->cpitch", "#{dsrc}->stride",
+                "#{dsrc}->dims", elem_size ]
+      pr Return(aml_copy_nd_c(stride: true, shuffle: shuffle).call(*args))
+    else
+      coords = Sizet :coords, dim: Dim()
+      coords_out = Sizet :coords_out, dim: Dim()
+      elem_number = Sizet :elem_number, dim: Dim()
+      elem_number2 = Sizet :elem_number2, dim: Dim()
+      decl coords
+      decl coords_out if shuffle
+      decl elem_number
+      decl elem_number2
+
+      pr assert( FuncCall( :aml_layout_ndims, dst ) == FuncCall( :aml_layout_ndims, src ) )
+      pr d === FuncCall( :aml_layout_ndims, dst )
+      pr assert( FuncCall( :aml_layout_element_size, dst ) == FuncCall( :aml_layout_element_size, src ) )
+      pr elem_size === FuncCall( :aml_layout_element_size, dst )
+      pr coords === alloca(d * sizeof("size_t")).cast(coords)
+      pr coords_out === alloca(d * sizeof("size_t")).cast(coords_out) if shuffle
+      pr elem_number === alloca(d * sizeof("size_t")).cast(elem_number)
+      pr elem_number2 === alloca(d * sizeof("size_t")).cast(elem_number2)
+      pr FuncCall( :aml_layout_adims_column, src, elem_number )
+      pr FuncCall( :aml_layout_adims_column, dst, elem_number2 )
+      pr For(i, 0, d, operator: '<', declit: true) {
+        pr assert( "#{elem_number}[#{src_index[i]}] == #{elem_number2}[#{dst_index[i]}]" )
+      }
+
+      new_args = [d, dst, src, elem_number, elem_size, coords]
+      new_args << coords_out << target_dims if shuffle
+
+      pr aml_copy_layout_generic_helper(shuffle: shuffle).call(*new_args)
+      pr Return(0)
+    end
+  }
+end
+
+def aml_copy_layout_transpose(native: true, reverse: false)
+  dst = Pointer :dst, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :inout
+  src = Pointer :src, type: CStruct::new(type_name: :aml_layout, members: {}), dir: :in
+
+  dsrc = Pointer :dsrc, type: CStruct::new(type_name: :aml_layout_data_native, members: {})
+  target_dims = Sizet :target_dims, dim: Dim()
+  d = Sizet :d
+  i = Sizet :i
+
+  name = name_prefix + "layout_"
+  name << "reverse_" if reverse
+  name << "transpose_"
+  name << (native ? "native" : "generic")
+  p = Procedure( name, [ dst, src ], return_type: Int ) {
+    decl d
+    decl target_dims
+    decl dsrc
+
+    pr dsrc === "(struct aml_layout_data_native *)#{src}->data"
+    pr d === "#{dsrc}->ndims"
+    pr target_dims === alloca(d * sizeof("size_t")).cast(target_dims)
+    if reverse
+      pr target_dims[0] === d - 1
+      pr For(i, 1, d, operator: '<', declit: true) {
+        pr target_dims[i] === i - 1
+      }
+    else
+      pr target_dims[d - 1] === 0
+      pr For(i, 0, d - 1, operator: '<', declit: true) {
+        pr target_dims[i] === i + 1
+      }
+    end
+    pr Return( aml_copy_layout(native: native, shuffle: true).call( dst, src, target_dims) )
+  }
+end
+
+pr $aml_compute_cumulative_pitch = aml_compute_cumulative_pitch
+
+generation_space = BruteForceOptimizer::new(
+  OptimizationSpace::new(
+    shuffle: [false, true],
+    stride: [false, true]
+  )
+)
+
+transpose_generation_space = BruteForceOptimizer::new(
+  OptimizationSpace::new(
+    stride: [false, true],
+    reverse: [false, true],
+    cumulative: [false, true]
+  )
+)
+
+generation_space.each { |params|
+  pr aml_copy_nd_helper(**params)
+  pr aml_copy_nd_c(**params)
+  pr aml_copy_nd(**params)
+}
+
+transpose_generation_space.each { |params|
+  pr aml_copy_tnd(**params)
+}
+
+pr aml_copy_layout
+pr aml_copy_layout(shuffle: true)
+pr aml_copy_layout_transpose
+pr aml_copy_layout_transpose(reverse: true)
+
+pr aml_copy_layout_generic_helper(shuffle: false)
+pr aml_copy_layout_generic_helper(shuffle: true)
+pr aml_copy_layout(native: false)
+pr aml_copy_layout(native: false, shuffle: true)
+pr aml_copy_layout_transpose(native: false)
+pr aml_copy_layout_transpose(native: false, reverse: true)
+
+stdout0.close
+
+Process.wait(pid1)
+Process.wait(pid2)
+
diff --git a/src/dma_layout.c b/src/dma_layout.c
new file mode 100644
index 00000000..609cab96
--- /dev/null
+++ b/src/dma_layout.c
@@ -0,0 +1,160 @@
+#include <aml.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+/*******************************************************************************
+ * Requests:
+ ******************************************************************************/
+
+int aml_dma_request_layout_init(struct aml_dma_request_layout *req,
+				struct aml_layout *dl,
+				struct aml_layout *sl)
+{
+	assert(req != NULL);
+	req->type = AML_DMA_REQUEST_TYPE_COPY;
+	/* figure out pointers */
+	req->dest = dl;
+	req->src = sl;
+	return 0;
+}
+
+int aml_dma_request_layout_destroy(struct aml_dma_request_layout *r)
+{
+	assert(r != NULL);
+	return 0;
+}
+
+/*******************************************************************************
+ * Public API
+ ******************************************************************************/
+
+int aml_dma_layout_create_request(struct aml_dma_data *d,
+				  struct aml_dma_request **r,
+				  int type, va_list ap)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_dma_layout *dma =
+		(struct aml_dma_layout *)d;
+
+	struct aml_dma_request_layout *req;
+
+	pthread_mutex_lock(&dma->lock);
+	req = aml_vector_add(&dma->requests);
+
+	/* we don't support move at this time */
+	assert(type == AML_DMA_REQUEST_TYPE_COPY);
+	struct aml_layout *dl, *sl;
+	void *arg;
+	dl = va_arg(ap, struct aml_layout *);
+	sl = va_arg(ap, struct aml_layout *);
+	aml_dma_request_layout_init(req, dl, sl);
+
+	pthread_mutex_unlock(&dma->lock);
+	*r = (struct aml_dma_request *)req;
+	return 0;
+}
+
+int aml_dma_layout_destroy_request(struct aml_dma_data *d,
+				   struct aml_dma_request *r)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_dma_layout *dma =
+		(struct aml_dma_layout *)d;
+
+	struct aml_dma_request_layout *req =
+		(struct aml_dma_request_layout *)r;
+
+	assert(req->type == AML_DMA_REQUEST_TYPE_COPY);
+	aml_dma_request_layout_destroy(req);
+
+	/* enough to remove from request vector */
+	pthread_mutex_lock(&dma->lock);
+	aml_vector_remove(&dma->requests, req);
+	pthread_mutex_unlock(&dma->lock);
+	return 0;
+}
+
+int aml_dma_layout_wait_request(struct aml_dma_data *d,
+				   struct aml_dma_request *r)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_dma_layout *dma = (struct aml_dma_layout *)d;
+	struct aml_dma_request_layout *req =
+		(struct aml_dma_request_layout *)r;
+
+	/* execute */
+	assert(req->type == AML_DMA_REQUEST_TYPE_COPY);
+	dma->do_work(req->dest, req->src, dma->work_arg);
+
+	/* destroy a completed request */
+	aml_dma_layout_destroy_request(d, r);
+	return 0;
+}
+
+struct aml_dma_ops aml_dma_ops_layout = {
+	aml_dma_layout_create_request,
+	aml_dma_layout_destroy_request,
+	aml_dma_layout_wait_request,
+};
+
+/*******************************************************************************
+ * Init functions:
+ ******************************************************************************/
+
+int aml_dma_layout_create(struct aml_dma **d, ...)
+{
+	va_list ap;
+	struct aml_dma *ret = NULL;
+	intptr_t baseptr, dataptr;
+	va_start(ap, d);
+
+	/* alloc */
+	baseptr = (intptr_t) calloc(1, AML_DMA_LAYOUT_ALLOCSIZE);
+	dataptr = baseptr + sizeof(struct aml_dma);
+
+	ret = (struct aml_dma *)baseptr;
+	ret->data = (struct aml_dma_data *)dataptr;
+
+	aml_dma_layout_vinit(ret, ap);
+
+	va_end(ap);
+	*d = ret;
+	return 0;
+}
+int aml_dma_layout_vinit(struct aml_dma *d, va_list ap)
+{
+	d->ops = &aml_dma_ops_layout;
+	struct aml_dma_layout *dma = (struct aml_dma_layout *)d->data;
+
+	/* request vector */
+	size_t nbreqs = va_arg(ap, size_t);
+	dma->do_work = va_arg(ap, aml_dma_operator);
+	dma->work_arg = va_arg(ap, void *);
+	aml_vector_init(&dma->requests, nbreqs,
+			sizeof(struct aml_dma_request_layout),
+			offsetof(struct aml_dma_request_layout, type),
+			AML_DMA_REQUEST_TYPE_INVALID);
+	pthread_mutex_init(&dma->lock, NULL);
+	return 0;
+}
+int aml_dma_layout_init(struct aml_dma *d, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, d);
+	err = aml_dma_layout_vinit(d, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_dma_layout_destroy(struct aml_dma *d)
+{
+	struct aml_dma_layout *dma = (struct aml_dma_layout *)d->data;
+	aml_vector_destroy(&dma->requests);
+	pthread_mutex_destroy(&dma->lock);
+	return 0;
+}
diff --git a/src/layout.c b/src/layout.c
new file mode 100644
index 00000000..0d550fa2
--- /dev/null
+++ b/src/layout.c
@@ -0,0 +1,138 @@
+#include <aml.h>
+
+/*******************************************************************************
+ * General API: common operators:
+ ******************************************************************************/
+
+void *aml_layout_deref(const struct aml_layout *layout, ...)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	va_list ap;
+	void *ret;
+	va_start(ap, layout);
+	ret = layout->ops->deref(layout->data, ap);
+	va_end(ap);
+	return ret;
+}
+
+void *aml_layout_aderef(const struct aml_layout *layout, const size_t *coords)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->aderef(layout->data, coords);
+}
+
+void *aml_layout_aderef_column(const struct aml_layout *layout,
+			       const size_t *coords)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->aderef_column(layout->data, coords);
+}
+
+int aml_layout_order(const struct aml_layout *layout)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->order(layout->data);
+}
+
+int aml_layout_dims(const struct aml_layout *layout, ...)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	va_list ap;
+	int ret;
+	va_start(ap, layout);
+	ret = layout->ops->dims(layout->data, ap);
+	va_end(ap);
+	return ret;
+}
+
+int aml_layout_adims(const struct aml_layout *layout, size_t *dims)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->adims(layout->data, dims);
+}
+
+int aml_layout_adims_column(const struct aml_layout *layout, size_t *dims)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->adims_column(layout->data, dims);
+}
+
+size_t aml_layout_ndims(const struct aml_layout *layout)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->ndims(layout->data);
+}
+
+size_t aml_layout_element_size(const struct aml_layout *layout)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	return layout->ops->element_size(layout->data);
+}
+
+struct aml_layout * aml_layout_areshape(const struct aml_layout *layout,
+					size_t ndims, const size_t *dims)
+{
+	assert(ndims != 0);
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	assert(layout->ops->areshape != NULL);
+	return layout->ops->areshape(layout->data, ndims, dims);
+}
+
+struct aml_layout * aml_layout_reshape(const struct aml_layout *layout,
+				       size_t ndims, ...)
+{
+	assert(ndims != 0);
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	assert(layout->ops->reshape != NULL);
+	va_list ap;
+	struct aml_layout *ret;
+	va_start(ap, ndims);
+	ret = layout->ops->reshape(layout->data, ndims, ap);
+	va_end(ap);
+	return ret;
+}
+
+struct aml_layout * aml_layout_slice(const struct aml_layout *layout, ...)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	assert(layout->ops->slice != NULL);
+        va_list ap;
+	struct aml_layout *ret;
+	va_start(ap, layout);
+	ret = layout->ops->slice(layout->data, ap);
+        va_end(ap);
+	return ret;
+}
+
+struct aml_layout * aml_layout_aslice(const struct aml_layout *layout,
+				      const size_t *offsets, const size_t *dims,
+				      const size_t *strides)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	assert(layout->ops->aslice != NULL);
+	return layout->ops->aslice(layout->data, offsets, dims, strides);
+}
+
+struct aml_layout * aml_layout_aslice_column(const struct aml_layout *layout,
+					     const size_t *offsets,
+					     const size_t *dims,
+					     const size_t *strides)
+{
+	assert(layout != NULL);
+	assert(layout->ops != NULL);
+	assert(layout->ops->aslice != NULL);
+	return layout->ops->aslice_column(layout->data, offsets, dims, strides);
+}
diff --git a/src/layout_dense.c b/src/layout_dense.c
new file mode 100644
index 00000000..2c7ea9ee
--- /dev/null
+++ b/src/layout_dense.c
@@ -0,0 +1,638 @@
+#include <aml.h>
+
+/*******************************************************************************
+ * Native layout initialization:
+ ******************************************************************************/
+
+int aml_layout_native_struct_init(struct aml_layout *layout, size_t ndims,
+				  void *memory)
+{
+	struct aml_layout_data_native *dataptr;
+
+	assert(layout == (struct aml_layout *)memory);
+	memory = (void *)((uintptr_t)memory +
+		      sizeof(struct aml_layout));
+	dataptr = memory;
+	layout->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		      sizeof(struct aml_layout_data_native));
+	dataptr->ndims = ndims;
+	dataptr->dims = (size_t *)memory;
+	dataptr->stride = dataptr->dims + ndims;
+	dataptr->pitch = dataptr->stride + ndims;
+	dataptr->cpitch = dataptr->pitch + ndims;
+	return 0;
+}
+
+static
+int aml_layout_native_ainit_cpitch(struct aml_layout *layout,
+				   uint64_t tags, void *ptr, size_t ndims,
+				   const size_t *dims, const size_t *stride,
+				   const size_t *cpitch)
+{
+	struct aml_layout_data_native *data =
+	    (struct aml_layout_data_native *)layout->data;
+	layout->tags = tags;
+	data->ptr = ptr;
+	memcpy(data->dims, dims, ndims * sizeof(size_t));
+	memcpy(data->stride, stride, ndims * sizeof(size_t));
+	memset(data->pitch, 0, ndims * sizeof(size_t));
+	memcpy(data->cpitch, cpitch, (ndims + 1) * sizeof(size_t));
+	return 0;
+}
+
+int aml_layout_native_ainit(struct aml_layout *layout, uint64_t tags, void *ptr,
+			    const size_t element_size, size_t ndims,
+			    const size_t *dims, const size_t *stride,
+			    const size_t *pitch)
+{
+	assert(layout != NULL);
+	assert(layout->data != NULL);
+	struct aml_layout_data_native *data =
+	    (struct aml_layout_data_native *)layout->data;
+	assert(data->ndims == ndims);
+	assert(data->dims);
+	assert(data->stride);
+	assert(data->pitch);
+	assert(data->cpitch);
+	data->ptr = ptr;
+	int type = AML_TYPE_GET(tags, AML_TYPE_LAYOUT_ORDER);
+	if(type == AML_TYPE_LAYOUT_ROW_ORDER)
+	{
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_ROW_ORDER);
+		layout->ops = &aml_layout_row_ops;
+		for(size_t i = 0; i < ndims; i++)
+		{
+			data->dims[i] = dims[ndims-i-1];
+			data->stride[i] = stride[ndims-i-1];
+			data->pitch[i] = pitch[ndims-i-1];
+		}
+		data->cpitch[0] = element_size;
+		for(size_t i = 1; i <= ndims; i++)
+			data->cpitch[i] = data->cpitch[i-1]*pitch[ndims-i];
+	}
+	else if(type == AML_TYPE_LAYOUT_COLUMN_ORDER)
+	{
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_COLUMN_ORDER);
+		layout->ops = &aml_layout_column_ops;
+		memcpy(data->dims, dims, ndims * sizeof(size_t));
+		memcpy(data->stride, stride, ndims * sizeof(size_t));
+		memcpy(data->pitch, pitch, ndims * sizeof(size_t));
+		data->cpitch[0] = element_size;
+		for(size_t i = 1; i <= ndims; i++)
+			data->cpitch[i] = data->cpitch[i-1]*pitch[i-1];
+	}
+	return 0;
+}
+
+int aml_layout_native_vinit(struct aml_layout *p, uint64_t tags, void *ptr,
+			    const size_t element_size, size_t ndims, va_list ap)
+{
+	size_t dims[ndims];
+	size_t stride[ndims];
+	size_t pitch[ndims-1];
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = va_arg(ap, size_t);
+	for(size_t i = 0; i < ndims; i++)
+		stride[i] = va_arg(ap, size_t);
+	for(size_t i = 0; i < ndims; i++)
+		pitch[i] = va_arg(ap, size_t);
+	return aml_layout_native_ainit(p, tags, ptr, element_size, ndims, dims,
+				       stride, pitch);
+}
+
+int aml_layout_native_init(struct aml_layout *p, uint64_t tags, void *ptr,
+			   const size_t element_size, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, ndims);
+	err = aml_layout_native_vinit(p, tags, ptr, element_size, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_layout_native_acreate(struct aml_layout **layout, uint64_t tags,
+			      void *ptr, const size_t element_size,
+			      size_t ndims, const size_t *dims,
+			      const size_t *stride, const size_t *pitch)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(*layout, ndims, baseptr);
+	return aml_layout_native_ainit(*layout, tags, ptr, element_size, ndims,
+				       dims, stride, pitch);
+}
+
+int aml_layout_native_vcreate(struct aml_layout **layout, uint64_t tags,
+			      void *ptr, const size_t element_size,
+			      size_t ndims, va_list ap)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(*layout, ndims, baseptr);
+	return aml_layout_native_vinit(*layout, tags, ptr, element_size, ndims,
+				       ap);
+}
+
+int aml_layout_native_create(struct aml_layout **layout, uint64_t tags,
+			     void *ptr, const size_t element_size, size_t ndims,
+			     ...)
+{
+	int err;
+	va_list ap;
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(*layout, ndims, baseptr);
+	va_start(ap, ndims);
+	err = aml_layout_native_vinit(*layout, tags, ptr, element_size, ndims,
+				      ap);
+	va_end(ap);
+	return err;
+}
+
+/*******************************************************************************
+ * COLUMN OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_column_deref(const struct aml_layout_data *data,
+			      va_list coords)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	void *ptr;
+	assert(d != NULL);
+	assert(d->ptr != NULL);
+	ptr = d->ptr;
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t c = va_arg(coords, size_t);
+		assert(c < d->dims[i]);
+		ptr += c*d->cpitch[i]*d->stride[i];
+	}
+	return ptr;
+}
+
+void *aml_layout_column_aderef(const struct aml_layout_data *data,
+			       const size_t *coords)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	void *ptr;
+	assert(d != NULL);
+	assert(d->ptr != NULL);
+	ptr = d->ptr;
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		assert(coords[i] < d->dims[i]);
+		ptr += coords[i]*d->cpitch[i]*d->stride[i];
+	}
+	return ptr;
+}
+
+int aml_layout_column_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_COLUMN_ORDER;
+}
+
+int aml_layout_column_dims(const struct aml_layout_data *data, va_list dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[i];
+	}
+	return 0;
+}
+
+int aml_layout_column_adims(const struct aml_layout_data *data, size_t *dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	assert(d != NULL);
+	assert(dims != NULL);
+	memcpy((void*)dims, (void*)d->dims, sizeof(size_t)*d->ndims);
+	return 0;
+}
+
+size_t aml_layout_column_ndims(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	return d->ndims;
+}
+
+size_t aml_layout_column_element_size(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	return d->cpitch[0];
+}
+
+static void merge_dims(size_t ndims,
+		       const size_t *dims, const size_t *stride,
+		       const size_t *cpitch, size_t *new_ndims,
+		       size_t *new_dims, size_t *new_stride,
+		       size_t *new_cpitch)
+{
+	size_t dim_index = 0;
+	size_t new_dim_index = 0;
+	new_dims[new_dim_index] = dims[dim_index];
+	new_cpitch[new_dim_index] = cpitch[dim_index];
+	new_stride[new_dim_index] = stride[dim_index];
+	for (; dim_index < ndims - 1; dim_index++) {
+		if (dims[dim_index] * stride[dim_index] * cpitch[dim_index] ==
+		    cpitch[dim_index + 1] && stride[dim_index + 1] == 1) {
+			new_dims[new_dim_index] *= dims[dim_index + 1];
+		} else {
+			new_dim_index++;
+			new_dims[new_dim_index] = dims[dim_index + 1];
+			new_cpitch[new_dim_index] = cpitch[dim_index + 1];
+			new_stride[new_dim_index] = stride[dim_index + 1];
+		}	
+	}
+	new_cpitch[new_dim_index + 1] = cpitch[dim_index + 1];
+	*new_ndims = new_dim_index + 1;
+}
+
+static void
+reshape_dims(const struct aml_layout_data_native *d, size_t ndims,
+	     const size_t *dims, size_t *n_stride, size_t *n_cpitch)
+{
+	size_t m_ndims;
+	size_t m_dims[d->ndims];
+	size_t m_stride[d->ndims];
+	size_t m_cpitch[d->ndims + 1];
+
+	merge_dims(d->ndims, d->dims, d->stride, d->cpitch,
+		   &m_ndims, m_dims, m_stride, m_cpitch);
+
+	size_t m_dim_index = 0;
+
+	n_cpitch[0] = m_cpitch[m_dim_index];
+	for (size_t i = 0; i < ndims; i++) {
+		if (m_dims[m_dim_index] == dims[i]) {
+			n_stride[i] = m_stride[m_dim_index];
+			n_cpitch[i + 1] = m_cpitch[m_dim_index + 1];
+			m_dim_index++;
+		} else if (m_dims[m_dim_index] % dims[i] == 0) {
+			m_dims[m_dim_index] /= dims[i];
+			n_stride[i] = m_stride[m_dim_index];
+			n_cpitch[i + 1] =
+			    n_cpitch[i] * dims[i] * m_stride[m_dim_index];
+			m_stride[m_dim_index] = 1;
+		} else {
+			assert(0);
+		}
+	}
+}
+			
+struct aml_layout *
+aml_layout_column_areshape(const struct aml_layout_data *data, size_t ndims,
+			   const size_t *dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t total_size, new_total_size;
+	total_size = d->dims[0];
+	for (size_t i = 1; i < d->ndims; i++)
+		total_size *= d->dims[i];
+	new_total_size = dims[0];
+	for (size_t i = 1; i < ndims; i++)
+		new_total_size *= dims[i];
+	assert(total_size == total_size);
+
+	size_t stride[ndims];
+	size_t cpitch[ndims + 1];
+	reshape_dims(d, ndims, dims, stride, cpitch);
+	
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	struct aml_layout *layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(layout, ndims, baseptr);
+
+	aml_layout_native_ainit_cpitch(layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				       d->ptr, ndims, dims, stride, cpitch);
+	layout->ops = &aml_layout_column_ops;
+
+	return layout;
+}
+
+struct aml_layout *
+aml_layout_column_reshape(const struct aml_layout_data *data, size_t ndims,
+			  va_list dims)
+{
+	size_t n_dims[ndims];
+	for (int i = 0; i < ndims; i++) {
+		n_dims[i] = va_arg(dims, size_t);
+	}
+	return aml_layout_column_areshape(data, ndims, n_dims);
+}
+
+struct aml_layout *
+aml_layout_column_aslice(const struct aml_layout_data *data,
+			 const size_t *offsets, const size_t *dims,
+			 const size_t *strides)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t ndims = d->ndims;
+	for (size_t i = 0; i < ndims; i++)
+		assert(offsets[i] + (dims[i] - 1) * strides[i] < d->dims[i]);
+        void * ptr = aml_layout_column_aderef(data, offsets);
+	size_t cpitch[ndims + 1];
+	size_t new_strides[ndims];
+        cpitch[ndims] = d->cpitch[ndims];
+        for (size_t i = 0; i < ndims; i++) {
+		cpitch[i] = d->cpitch[i];
+		new_strides[i] = strides[i] * d->stride[i];
+		cpitch[ndims] -= cpitch[i] * offsets[i] * d->stride[i];
+	}
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	struct aml_layout *layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(layout, ndims, baseptr);
+
+	aml_layout_native_ainit_cpitch(layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				       ptr, ndims, dims, new_strides, cpitch);
+	layout->ops = &aml_layout_column_ops;
+
+	return layout;
+}
+
+struct aml_layout *
+aml_layout_column_slice(const struct aml_layout_data *data, va_list args)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+	for (int i = 0; i < ndims; i++)
+		offsets[i] = va_arg(args, size_t);
+	for (int i = 0; i < ndims; i++)
+		dims[i] = va_arg(args, size_t);
+	for (int i = 0; i < ndims; i++)
+		strides[i] = va_arg(args, size_t);
+	return aml_layout_column_aslice(data, offsets, dims, strides);
+}
+
+struct aml_layout_ops aml_layout_column_ops = {
+	aml_layout_column_deref,
+	aml_layout_column_aderef,
+	aml_layout_column_aderef,
+	aml_layout_column_order,
+	aml_layout_column_dims,
+	aml_layout_column_adims,
+	aml_layout_column_adims,
+	aml_layout_column_ndims,
+	aml_layout_column_element_size,
+	aml_layout_column_reshape,
+	aml_layout_column_areshape,
+        aml_layout_column_slice,
+	aml_layout_column_aslice,
+	aml_layout_column_aslice
+};
+
+/*******************************************************************************
+ * ROW OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_row_deref(const struct aml_layout_data *data, va_list coords)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	void *ptr;
+	assert(d != NULL);
+	assert(d->ptr != NULL);
+	ptr = d->ptr;
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t c = va_arg(coords, size_t);
+		assert(c < d->dims[d->ndims - i - 1]);
+		ptr += c * d->cpitch[d->ndims - i - 1] *
+			   d->stride[d->ndims - i - 1];
+	}
+	return ptr;
+}
+
+void *aml_layout_row_aderef(const struct aml_layout_data *data,
+			    const size_t *coords)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	void *ptr;
+	assert(d != NULL);
+	assert(d->ptr != NULL);
+	ptr = d->ptr;
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t c = coords[i];
+		assert(c < d->dims[d->ndims - i - 1]);
+		ptr += c * d->cpitch[d->ndims - i - 1] *
+			   d->stride[d->ndims - i - 1];
+	}
+	return ptr;
+}
+
+int aml_layout_row_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_ROW_ORDER;
+}
+
+int aml_layout_row_dims(const struct aml_layout_data *data, va_list dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int aml_layout_row_adims(const struct aml_layout_data *data, size_t *dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		dims[i] = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+size_t aml_layout_row_ndims(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	return d->ndims;
+}
+
+size_t aml_layout_row_element_size(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	return d->cpitch[0];
+}
+
+struct aml_layout *
+aml_layout_row_areshape(const struct aml_layout_data *data, size_t ndims,
+		        const size_t *dims)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t total_size, new_total_size;
+	total_size = d->dims[0];
+	for (size_t i = 1; i < d->ndims; i++)
+		total_size *= d->dims[i];
+	new_total_size = dims[0];
+	for (size_t i = 1; i < ndims; i++)
+		new_total_size *= dims[i];
+	assert(total_size == total_size);
+
+	size_t n_dims[ndims];
+	for (int i = 0; i < ndims; i++)
+		n_dims[ndims - i - 1] = dims[i];
+	
+	size_t stride[ndims];
+	size_t cpitch[ndims + 1];
+	reshape_dims(d, ndims, n_dims, stride, cpitch);
+	
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	struct aml_layout *layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(layout, ndims, baseptr);
+
+	aml_layout_native_ainit_cpitch(layout, AML_TYPE_LAYOUT_ROW_ORDER,
+				       d->ptr, ndims, n_dims, stride, cpitch);
+	layout->ops = &aml_layout_row_ops;
+
+	return layout;
+}
+
+struct aml_layout *
+aml_layout_row_reshape(const struct aml_layout_data *data, size_t ndims,
+		       va_list dims)
+{
+	size_t n_dims[ndims];
+	for (int i = 0; i < ndims; i++)
+		n_dims[i] = va_arg(dims, size_t);
+	return aml_layout_row_areshape(data, ndims, n_dims);
+}
+
+
+struct aml_layout *
+aml_layout_row_aslice(const struct aml_layout_data *data,
+			 const size_t *offsets, const size_t *dims,
+			 const size_t *strides)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t ndims = d->ndims;
+	size_t n_offsets[ndims];
+	size_t n_dims[ndims];
+	size_t n_strides[ndims];
+	for (size_t i = 0; i < ndims; i++) {
+		n_offsets[i] = offsets[ndims - i - 1];
+		n_dims[i] = dims[ndims - i - 1];
+		n_strides[i] = strides[ndims - i - 1];
+	}
+	for (size_t i = 0; i < ndims; i++)
+		assert(n_offsets[i] + (n_dims[i] - 1) * n_strides[i] <
+			   d->dims[i]);
+        void * ptr = aml_layout_column_aderef(data, n_offsets);
+	size_t cpitch[ndims + 1];
+        cpitch[ndims] = d->cpitch[ndims];
+        for (size_t i = 0; i < ndims; i++) {
+		cpitch[i] = d->cpitch[i];
+		n_strides[i] *= d->stride[i];
+		cpitch[ndims] -= cpitch[i] * n_offsets[i] * d->stride[i];
+	}
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	struct aml_layout *layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(layout, ndims, baseptr);
+
+	aml_layout_native_ainit_cpitch(layout, AML_TYPE_LAYOUT_ROW_ORDER,
+				       ptr, ndims, n_dims, n_strides, cpitch);
+	layout->ops = &aml_layout_row_ops;
+
+	return layout;
+}
+
+struct aml_layout *
+aml_layout_row_slice(const struct aml_layout_data *data, va_list args)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+	for (int i = 0; i < ndims; i++)
+		offsets[i] = va_arg(args, size_t);
+	for (int i = 0; i < ndims; i++)
+		dims[i] = va_arg(args, size_t);
+	for (int i = 0; i < ndims; i++)
+		strides[i] = va_arg(args, size_t);
+	return aml_layout_row_aslice(data, offsets, dims, strides);
+}
+
+struct aml_layout *
+aml_layout_row_aslice_column(const struct aml_layout_data *data,
+			     const size_t *offsets, const size_t *dims,
+			     const size_t *strides)
+{
+	const struct aml_layout_data_native *d =
+	    (const struct aml_layout_data_native *)data;
+	size_t ndims = d->ndims;
+	for (size_t i = 0; i < ndims; i++)
+		assert(offsets[i] + (dims[i] - 1) * strides[i] < d->dims[i]);
+        void * ptr = aml_layout_column_aderef(data, offsets);
+	size_t cpitch[ndims + 1];
+	size_t new_strides[ndims];
+        cpitch[ndims] = d->cpitch[ndims];
+        for (size_t i = 0; i < ndims; i++) {
+		cpitch[i] = d->cpitch[i];
+		new_strides[i] = strides[i] * d->stride[i];
+		cpitch[ndims] -= cpitch[i] * offsets[i] * d->stride[i];
+	}
+	void *baseptr = calloc(1, AML_LAYOUT_NATIVE_ALLOCSIZE(ndims));
+	struct aml_layout *layout = (struct aml_layout *)baseptr;
+	aml_layout_native_struct_init(layout, ndims, baseptr);
+
+	aml_layout_native_ainit_cpitch(layout, AML_TYPE_LAYOUT_ROW_ORDER,
+				       ptr, ndims, dims, new_strides, cpitch);
+	layout->ops = &aml_layout_row_ops;
+
+	return layout;
+}
+
+struct aml_layout_ops aml_layout_row_ops = {
+	aml_layout_row_deref,
+	aml_layout_row_aderef,
+	aml_layout_column_aderef,
+	aml_layout_row_order,
+	aml_layout_row_dims,
+	aml_layout_row_adims,
+	aml_layout_column_adims,
+	aml_layout_row_ndims,
+	aml_layout_row_element_size,
+	aml_layout_row_reshape,
+	aml_layout_row_areshape,
+	aml_layout_row_slice,
+	aml_layout_row_aslice,
+	aml_layout_row_aslice_column
+};
+
diff --git a/src/layout_pad.c b/src/layout_pad.c
new file mode 100644
index 00000000..3e1564bd
--- /dev/null
+++ b/src/layout_pad.c
@@ -0,0 +1,319 @@
+#include <aml.h>
+
+int aml_layout_pad_struct_init(struct aml_layout *layout, size_t ndims,
+			       size_t element_size, void *memory)
+{
+	struct aml_layout_data_pad *dataptr;
+
+	assert(layout == (struct aml_layout *)memory);
+	memory = (void *)((uintptr_t)memory + sizeof(struct aml_layout));
+	dataptr = memory;
+	layout->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		      sizeof(struct aml_layout_data_pad));
+	dataptr->target = NULL;
+	dataptr->ndims = ndims;
+	dataptr->element_size = element_size;
+	dataptr->dims = (size_t *)memory;
+	dataptr->target_dims = dataptr->dims + ndims;
+	dataptr->neutral = (void *)(dataptr->target_dims + ndims);
+	return 0;
+}
+
+int aml_layout_pad_ainit(struct aml_layout *layout, uint64_t tags,
+			 struct aml_layout *target, const size_t *dims,
+			 void *neutral)
+{
+	assert(layout != NULL);
+	assert(layout->data != NULL);
+	struct aml_layout_data_pad *data =
+	    (struct aml_layout_data_pad *)layout->data;
+	size_t ndims = aml_layout_ndims(target);
+	size_t element_size = aml_layout_element_size(target);
+	assert(data->ndims == ndims);
+	assert(data->element_size == element_size);
+	assert(data->dims);
+	assert(data->target_dims);
+	assert(data->neutral);
+	int type = AML_TYPE_GET(tags, AML_TYPE_LAYOUT_ORDER);
+	if (type == AML_TYPE_LAYOUT_ROW_ORDER) {
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_ROW_ORDER);
+		layout->ops = &aml_layout_pad_row_ops;
+		for(size_t i = 0; i < ndims; i++)
+			data->dims[i] = dims[ndims-i-1];
+	} else if (type == AML_TYPE_LAYOUT_COLUMN_ORDER) {
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_COLUMN_ORDER);
+		layout->ops = &aml_layout_pad_column_ops;
+		memcpy(data->dims, dims, ndims * sizeof(size_t));
+	}
+	type = aml_layout_order(target);
+	if(type == AML_TYPE_LAYOUT_ROW_ORDER) {
+		size_t target_dims[ndims];
+		aml_layout_adims(target, target_dims);
+		for(size_t i = 0; i < ndims; i++)
+			data->target_dims[i] = target_dims[ndims-i-1];
+	} else if (type == AML_TYPE_LAYOUT_COLUMN_ORDER) {
+		aml_layout_adims(target, data->target_dims);
+	}
+	for(size_t i = 0; i < ndims; i++)
+		assert(data->dims[i] >= data->target_dims[i]);
+	memcpy(data->neutral, neutral, element_size);
+	data->target = target;
+	return 0;
+}
+
+int aml_layout_pad_vinit(struct aml_layout *layout, uint64_t tags,
+			 struct aml_layout *target, va_list ap)
+{
+	size_t ndims = aml_layout_ndims(target);
+	size_t dims[ndims];
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = va_arg(ap, size_t);
+	void *neutral = va_arg(ap, void *);
+	return aml_layout_pad_ainit(layout, tags, target, dims, neutral);
+}
+
+int aml_layout_pad_init(struct aml_layout *layout, uint64_t tags,
+			struct aml_layout *target, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, target);
+	err = aml_layout_pad_vinit(layout, tags, target, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_layout_pad_acreate(struct aml_layout **layout, uint64_t tags,
+			   struct aml_layout *target, const size_t *dims,
+			   void *neutral)
+{
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t ndims = aml_layout_ndims(target);
+	size_t element_size = aml_layout_element_size(target);
+	void *baseptr = calloc(1, AML_LAYOUT_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_pad_struct_init(*layout, ndims, element_size, baseptr);
+	return aml_layout_pad_ainit(*layout, tags, target, dims, neutral);
+}
+
+int aml_layout_pad_vcreate(struct aml_layout **layout, uint64_t tags,
+			   struct aml_layout *target, va_list ap)
+{
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t ndims = aml_layout_ndims(target);
+	size_t element_size = aml_layout_element_size(target);
+	void *baseptr = calloc(1, AML_LAYOUT_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_pad_struct_init(*layout, ndims, element_size, baseptr);
+	return aml_layout_pad_vinit(*layout, tags, target, ap);
+}
+
+int aml_layout_pad_create(struct aml_layout **layout, uint64_t tags,
+			  struct aml_layout *target, ...)
+{
+	int err;
+	va_list ap;
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t ndims = aml_layout_ndims(target);
+	size_t element_size = aml_layout_element_size(target);
+	void *baseptr = calloc(1, AML_LAYOUT_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_pad_struct_init(*layout, ndims, element_size, baseptr);
+	va_start(ap, target);
+	err = aml_layout_pad_vinit(*layout, tags, target, ap);
+	va_end(ap);
+	return err;
+}
+
+/*******************************************************************************
+ * COLUMN OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_pad_column_aderef(const struct aml_layout_data *data,
+				   const size_t *coords)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d !=NULL);
+	size_t ndims = d->ndims;
+	for (int i = 0; i < ndims; i++)
+		assert(coords[i] < d->dims[i]);
+	for (int i = 0; i < ndims; i++) {
+		if(coords[i] >= d->target_dims[i])
+			return d->neutral;
+	}
+	return d->target->ops->aderef_column(d->target->data, coords);
+}
+
+void *aml_layout_pad_column_deref(const struct aml_layout_data *data,
+				  va_list coords)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d !=NULL);
+	size_t ndims = d->ndims;
+	size_t target_coords[d->ndims];
+	for (int i = 0; i < ndims; i++)
+		target_coords[i] = va_arg(coords, size_t);
+	return aml_layout_pad_column_aderef(data, target_coords); 
+}
+
+int aml_layout_pad_column_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_COLUMN_ORDER;
+}
+
+int aml_layout_pad_column_dims(const struct aml_layout_data *data, va_list dims)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[i];
+	}
+	return 0;
+}
+
+int aml_layout_pad_column_adims(const struct aml_layout_data *data,
+				size_t *dims)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d != NULL);
+	assert(dims != NULL);
+	memcpy((void*)dims, (void*)d->dims, sizeof(size_t)*d->ndims);
+	return 0;
+}
+
+size_t aml_layout_pad_ndims(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	return d->ndims;
+}
+
+size_t aml_layout_pad_element_size(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	return d->element_size;
+}
+
+struct aml_layout_ops aml_layout_pad_column_ops = {
+	aml_layout_pad_column_deref,
+	aml_layout_pad_column_aderef,
+	aml_layout_pad_column_aderef,
+	aml_layout_pad_column_order,
+	aml_layout_pad_column_dims,
+	aml_layout_pad_column_adims,
+	aml_layout_pad_column_adims,
+	aml_layout_pad_ndims,
+	aml_layout_pad_element_size,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
+/*******************************************************************************
+ * ROW OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_pad_row_aderef(const struct aml_layout_data *data,
+				  const size_t *coords)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d !=NULL);
+	size_t ndims = d->ndims;
+	for (int i = 0; i < ndims; i++)
+		assert(coords[ndims - i - 1] < d->dims[i]);
+	for (int i = 0; i < ndims; i++) {
+		if(coords[ndims - i - 1] >= d->target_dims[i])
+			return d->neutral;
+	}
+	int type = aml_layout_order(d->target);
+	if (type == AML_TYPE_LAYOUT_ROW_ORDER)
+		return aml_layout_aderef(d->target, coords);
+	else {
+		size_t target_coords[ndims];
+		for (int i = 0; i < ndims; i++)
+			target_coords[i] = coords[ndims - i - 1];
+		return aml_layout_aderef(d->target, coords);
+	}
+}
+
+void *aml_layout_pad_row_deref(const struct aml_layout_data *data,
+				 va_list coords)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d !=NULL);
+	size_t ndims = d->ndims;
+	size_t target_coords[d->ndims];
+	for (int i = 0; i < ndims; i++)
+		target_coords[i] = va_arg(coords, size_t);
+	return aml_layout_pad_row_aderef(data, target_coords); 
+}
+
+int aml_layout_pad_row_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_ROW_ORDER;
+}
+
+int aml_layout_pad_row_dims(const struct aml_layout_data *data, va_list dims)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int aml_layout_pad_row_adims(const struct aml_layout_data *data, size_t *dims)
+{
+	const struct aml_layout_data_pad *d =
+	    (const struct aml_layout_data_pad *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		dims[i] = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+struct aml_layout_ops aml_layout_pad_row_ops = {
+	aml_layout_pad_row_deref,
+	aml_layout_pad_row_aderef,
+	aml_layout_pad_column_aderef,
+	aml_layout_pad_row_order,
+	aml_layout_pad_row_dims,
+	aml_layout_pad_row_adims,
+	aml_layout_pad_column_adims,
+	aml_layout_pad_ndims,
+	aml_layout_pad_element_size,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
diff --git a/src/layout_reshape.c b/src/layout_reshape.c
new file mode 100644
index 00000000..9cef96d5
--- /dev/null
+++ b/src/layout_reshape.c
@@ -0,0 +1,341 @@
+#include <aml.h>
+
+int aml_layout_reshape_struct_init(struct aml_layout *layout, size_t ndims,
+				   void *memory)
+{
+	struct aml_layout_data_reshape *dataptr;
+
+	assert(layout == (struct aml_layout *)memory);
+	memory = (void *)((uintptr_t)memory + sizeof(struct aml_layout));
+        dataptr = memory;
+	layout->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_layout_data_reshape));
+	dataptr->target = NULL;
+	dataptr->ndims = ndims;
+	dataptr->dims = (size_t *)memory;
+	dataptr->coffsets = dataptr->dims + ndims;
+	dataptr->target_dims = dataptr->dims + 2 * ndims;
+	return 0;
+}
+
+int aml_layout_reshape_ainit(struct aml_layout *layout, uint64_t tags,
+			     struct aml_layout *target, size_t ndims,
+			     const size_t *dims)
+{
+	assert(layout != NULL);
+	assert(layout->data != NULL);
+	struct aml_layout_data_reshape *data =
+	    (struct aml_layout_data_reshape *)layout->data;
+	size_t target_ndims = aml_layout_ndims(target);
+	assert(ndims != 0);
+	assert(data->ndims == ndims);
+        assert(data->dims);
+        assert(data->coffsets);
+	assert(data->target_dims);
+	data->target_ndims = target_ndims;
+	data->target = target;
+	assert(data->target_ndims != 0);
+	int type = AML_TYPE_GET(tags, AML_TYPE_LAYOUT_ORDER);
+	if (type == AML_TYPE_LAYOUT_ROW_ORDER) {
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_ROW_ORDER);
+		layout->ops = &aml_layout_reshape_row_ops;
+		for(size_t i = 0; i < ndims; i++)
+			data->dims[i] = dims[ndims-i-1];
+	} else {
+		AML_TYPE_SET(layout->tags, AML_TYPE_LAYOUT_ORDER,
+			     AML_TYPE_LAYOUT_COLUMN_ORDER);
+		layout->ops = &aml_layout_reshape_column_ops;
+		memcpy(data->dims, dims, ndims * sizeof(size_t));
+	}
+	type = aml_layout_order(target);
+	if(type == AML_TYPE_LAYOUT_ROW_ORDER) {
+		size_t target_dims[target_ndims];
+		aml_layout_adims(target, target_dims);
+		for(size_t i = 0; i < target_ndims; i++)
+			data->target_dims[i] = target_dims[target_ndims-i-1];
+	} else {
+		aml_layout_adims(target, data->target_dims);
+	}
+	size_t prod, target_prod;
+	prod = 1;
+	for(size_t i = 0; i < ndims; i++) {
+		data->coffsets[i] = prod;
+		prod *= data->dims[i];
+	}
+	target_prod = 1;
+	for(size_t i = 0; i < data->target_ndims; i++)
+		target_prod *= data->target_dims[i];
+	assert(target_prod == prod);
+	return 0;
+}
+
+int aml_layout_reshape_vinit(struct aml_layout *layout, uint64_t tags,
+			     struct aml_layout *target, size_t ndims,
+			     va_list data)
+{
+	size_t dims[ndims];
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = va_arg(data, size_t);
+	return aml_layout_reshape_ainit(layout, tags, target, ndims, dims);
+}
+
+int aml_layout_reshape_init(struct aml_layout *layout, uint64_t tags,
+			    struct aml_layout *target, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, ndims);
+	err = aml_layout_reshape_vinit(layout, tags, target, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_layout_reshape_acreate(struct aml_layout **layout, uint64_t tags,
+			       struct aml_layout *target, size_t ndims,
+			       const size_t *dims)
+{
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t target_ndims = aml_layout_ndims(target);
+	void *baseptr = calloc(1, AML_LAYOUT_RESHAPE_ALLOCSIZE(ndims,
+							       target_ndims));
+	assert(baseptr != NULL);
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_reshape_struct_init(*layout, ndims, baseptr);
+	return aml_layout_reshape_ainit(*layout, tags, target, ndims, dims);
+}
+
+int aml_layout_reshape_vcreate(struct aml_layout **layout, uint64_t tags,
+			       struct aml_layout *target, size_t ndims,
+			       va_list data)
+{
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t target_ndims = aml_layout_ndims(target);
+	void *baseptr = calloc(1, AML_LAYOUT_RESHAPE_ALLOCSIZE(ndims,
+							       target_ndims));
+	assert(baseptr != NULL);
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_reshape_struct_init(*layout, ndims, baseptr);
+	return aml_layout_reshape_vinit(*layout, tags, target, ndims, data);
+}
+
+int aml_layout_reshape_create(struct aml_layout **layout, uint64_t tags,
+			      struct aml_layout *target, size_t ndims, ...)
+{
+	int err;
+	va_list data;
+	assert(target != NULL);
+	assert(target->ops != NULL);
+	size_t target_ndims = aml_layout_ndims(target);
+	void *baseptr = calloc(1, AML_LAYOUT_RESHAPE_ALLOCSIZE(ndims,
+							       target_ndims));
+	assert(baseptr != NULL);
+	*layout = (struct aml_layout *)baseptr;
+	aml_layout_reshape_struct_init(*layout, ndims, baseptr);
+	va_start(data, ndims);
+	err = aml_layout_reshape_vinit(*layout, tags, target, ndims, data);
+	va_end(data);
+	return err;
+}
+
+/*******************************************************************************
+ * COLUMN OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_reshape_column_aderef(const struct aml_layout_data *data,
+				       const size_t *coords)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d !=NULL);
+
+	size_t ndims = d->ndims;
+
+	for (int i = 0; i < ndims; i++)
+		assert(coords[i] < d->dims[i]);
+
+	size_t target_ndims = d->target_ndims;
+	size_t offset = 0;
+	size_t remainder;
+	size_t target_coords[target_ndims];
+
+	for (int i = 0; i < ndims; i++)
+		offset += coords[i] * d->coffsets[i];
+
+	for (int i = 0; i < target_ndims; i++) {
+		target_coords[i] = offset % d->target_dims[i];
+		offset /= d->target_dims[i];
+	}
+	return d->target->ops->aderef_column(d->target->data, target_coords);
+}
+
+void *aml_layout_reshape_column_deref(const struct aml_layout_data *data,
+				      va_list coords)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d !=NULL);
+	size_t target_coords[d->ndims];
+	for (int i = 0; i < d->ndims; i++)
+		target_coords[i] = va_arg(coords, size_t);
+	return aml_layout_reshape_column_aderef(data, target_coords);
+}
+
+int aml_layout_reshape_column_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_COLUMN_ORDER;
+}
+
+int aml_layout_reshape_column_dims(const struct aml_layout_data *data, va_list dims)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[i];
+	}
+	return 0;
+}
+
+int aml_layout_reshape_column_adims(const struct aml_layout_data *data,
+				size_t *dims)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d != NULL);
+	assert(dims != NULL);
+	memcpy((void*)dims, (void*)d->dims, sizeof(size_t)*d->ndims);
+	return 0;
+}
+
+size_t aml_layout_reshape_ndims(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	return d->ndims;
+}
+
+size_t aml_layout_reshape_element_size(const struct aml_layout_data *data)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	return aml_layout_element_size(d->target);
+}
+
+struct aml_layout_ops aml_layout_reshape_column_ops = {
+	aml_layout_reshape_column_deref,
+	aml_layout_reshape_column_aderef,
+	aml_layout_reshape_column_aderef,
+	aml_layout_reshape_column_order,
+	aml_layout_reshape_column_dims,
+	aml_layout_reshape_column_adims,
+	aml_layout_reshape_column_adims,
+	aml_layout_reshape_ndims,
+	aml_layout_reshape_element_size,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
+/*******************************************************************************
+ * ROW OPERATORS:
+ ******************************************************************************/
+
+void *aml_layout_reshape_row_aderef(const struct aml_layout_data *data,
+				    const size_t *coords)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d !=NULL);
+
+	size_t ndims = d->ndims;
+
+	for (int i = 0; i < ndims; i++)
+		assert(coords[ndims - i - 1] < d->dims[i]);
+
+	size_t target_ndims = d->target_ndims;
+	size_t offset = 0;
+	size_t remainder;
+	size_t target_coords[target_ndims];
+
+	for (int i = 0; i < ndims; i++)
+		offset += coords[ndims - i - 1] * d->coffsets[i];
+
+	for (int i = 0; i < target_ndims; i++) {
+		target_coords[i] = offset % d->target_dims[i];
+		offset /= d->target_dims[i];
+	}
+	return d->target->ops->aderef_column(d->target->data, target_coords);
+}
+
+void *aml_layout_reshape_row_deref(const struct aml_layout_data *data,
+				   va_list coords)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d !=NULL);
+	size_t target_coords[d->ndims];
+	for (int i = 0; i < d->ndims; i++)
+		target_coords[i] = va_arg(coords, size_t);
+	return aml_layout_reshape_row_aderef(data, target_coords);
+}
+
+int aml_layout_reshape_row_order(const struct aml_layout_data *data)
+{
+	return AML_TYPE_LAYOUT_ROW_ORDER;
+}
+
+int aml_layout_reshape_row_dims(const struct aml_layout_data *data,
+				va_list dims)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		size_t *dim = va_arg(dims, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int aml_layout_reshape_row_adims(const struct aml_layout_data *data,
+				 size_t *dims)
+{
+	const struct aml_layout_data_reshape *d =
+	    (const struct aml_layout_data_reshape *)data;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++)
+	{
+		dims[i] = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+struct aml_layout_ops aml_layout_reshape_row_ops = {
+	aml_layout_reshape_row_deref,
+	aml_layout_reshape_row_aderef,
+	aml_layout_reshape_column_aderef,
+	aml_layout_reshape_row_order,
+	aml_layout_reshape_row_dims,
+	aml_layout_reshape_row_adims,
+	aml_layout_reshape_column_adims,
+	aml_layout_reshape_ndims,
+	aml_layout_reshape_element_size,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
+
diff --git a/src/scratch_double.c b/src/scratch_double.c
new file mode 100644
index 00000000..8806e7cf
--- /dev/null
+++ b/src/scratch_double.c
@@ -0,0 +1,287 @@
+#include <aml.h>
+#include <assert.h>
+
+/*******************************************************************************
+ * Requests:
+ ******************************************************************************/
+
+int aml_scratch_request_double_init(struct aml_scratch_request_double *req,
+				    int type, struct aml_dma *dma,
+				    struct aml_layout *dl, int dstid,
+				    struct aml_layout *sl, int srcid)
+
+{
+	assert(req != NULL);
+	req->type = type;
+	req->dma = dma;
+	req->dest = dl;
+	req->dstid = dstid;
+	req->src = sl;
+	req->srcid = srcid;
+	return 0;
+}
+
+int aml_scratch_request_double_destroy(struct aml_scratch_request_double *r)
+{
+	assert(r != NULL);
+	return 0;
+}
+
+/*******************************************************************************
+ * Internal functions
+ ******************************************************************************/
+void *aml_scratch_double_do_thread(void *arg)
+{
+	struct aml_scratch_request_double *req =
+		(struct aml_scratch_request_double *)arg;
+
+	aml_dma_copy(req->dma, req->dest, req->src);
+}
+
+struct aml_scratch_double_ops aml_scratch_double_inner_ops = {
+	aml_scratch_double_do_thread,
+};
+
+/*******************************************************************************
+ * Public API
+ ******************************************************************************/
+
+int aml_scratch_double_create_request(struct aml_scratch_data *d,
+				   struct aml_scratch_request **r,
+				   int type, va_list ap)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_scratch_double *scratch =
+		(struct aml_scratch_double *)d;
+
+	struct aml_scratch_request_double *req;
+
+	pthread_mutex_lock(&scratch->data.lock);
+	req = aml_vector_add(&scratch->data.requests);
+	/* init the request */
+	if(type == AML_SCRATCH_REQUEST_TYPE_PUSH)
+	{
+		struct aml_layout *scratch_layout;
+		struct aml_layout *src_layout;
+		int *src_uid;
+		int scratch_uid;
+
+		src_layout = va_arg(ap, struct aml_layout *);
+		src_uid = va_arg(ap, int *);
+		scratch_layout = va_arg(ap, struct aml_layout *);
+		scratch_uid = va_arg(ap, int);
+
+		/* find destination tile */
+		int *slot = aml_vector_get(&scratch->data.tilemap, scratch_uid);
+		assert(slot != NULL);
+		*src_uid = *slot;
+
+		/* init request */
+		aml_scratch_request_double_init(req, type,
+						scratch->data.push_dma,
+						src_layout, *src_uid,
+						scratch_layout, scratch_uid);
+	}
+	else if(type == AML_SCRATCH_REQUEST_TYPE_PULL)
+	{
+		struct aml_layout **scratch_layout;
+		struct aml_layout *src_layout;
+		int *scratch_uid;
+		int src_uid;
+
+		scratch_layout = va_arg(ap, struct aml_layout **);
+		scratch_uid  = va_arg(ap, int *);
+		src_layout = va_arg(ap, struct aml_layout *);
+		src_uid = va_arg(ap, int);
+
+		/* find scratchination tile
+		 * We don't use add here because adding a tile means allocating
+		 * new tiles on the sch_area too. */
+		int slot = aml_vector_find(&scratch->data.tilemap, src_uid);
+		if(slot == -1)
+		{
+			/* create a new request */
+			slot = aml_vector_find(&scratch->data.tilemap, -1);
+			assert(slot != -1);
+			int *tile = aml_vector_get(&scratch->data.tilemap, slot);
+			*tile = src_uid;
+		}
+		else
+			type = AML_SCRATCH_REQUEST_TYPE_NOOP;
+
+		/* save the key */
+		*scratch_uid = slot;
+		// *scratch_layout = aml_tiling_nd_get(scratch->data.scratch_tiling)
+
+		/* init request */
+		aml_scratch_request_double_init(req, type,
+						scratch->data.pull_dma,
+						*scratch_layout, slot,
+						src_layout, src_uid);
+	}
+	pthread_mutex_unlock(&scratch->data.lock);
+	/* thread creation */
+	if(req->type != AML_SCRATCH_REQUEST_TYPE_NOOP)
+	{
+		pthread_create(&req->thread, NULL, scratch->ops.do_thread, req);
+	}
+	*r = (struct aml_scratch_request *)req;
+	return 0;
+}
+
+int aml_scratch_double_destroy_request(struct aml_scratch_data *d,
+					 struct aml_scratch_request *r)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_scratch_double *scratch =
+		(struct aml_scratch_double *)d;
+
+	struct aml_scratch_request_double *req =
+		(struct aml_scratch_request_double *)r;
+	int *tile;
+
+	if(req->type != AML_SCRATCH_REQUEST_TYPE_NOOP)
+	{
+		pthread_cancel(req->thread);
+		pthread_join(req->thread, NULL);
+	}
+
+	aml_scratch_request_double_destroy(req);
+
+	/* destroy removes the tile from the scratch */
+	pthread_mutex_lock(&scratch->data.lock);
+	if(req->type == AML_SCRATCH_REQUEST_TYPE_PUSH)
+		tile = aml_vector_get(&scratch->data.tilemap,req->srcid);
+	else if(req->type == AML_SCRATCH_REQUEST_TYPE_PULL)
+		tile = aml_vector_get(&scratch->data.tilemap,req->dstid);
+	aml_vector_remove(&scratch->data.tilemap, tile);
+	aml_vector_remove(&scratch->data.requests, req);
+	pthread_mutex_unlock(&scratch->data.lock);
+	return 0;
+}
+
+int aml_scratch_double_wait_request(struct aml_scratch_data *d,
+				   struct aml_scratch_request *r)
+{
+	assert(d != NULL);
+	assert(r != NULL);
+	struct aml_scratch_double *scratch = (struct aml_scratch_double *)d;
+	struct aml_scratch_request_double *req =
+		(struct aml_scratch_request_double *)r;
+	int *tile;
+
+	/* wait for completion of the request */
+	if(req->type != AML_SCRATCH_REQUEST_TYPE_NOOP)
+		pthread_join(req->thread, NULL);
+
+	/* cleanup a completed request. In case of push, free up the tile */
+	aml_scratch_request_double_destroy(req);
+	pthread_mutex_lock(&scratch->data.lock);
+	if(req->type == AML_SCRATCH_REQUEST_TYPE_PUSH)
+	{
+		tile = aml_vector_get(&scratch->data.tilemap,req->srcid);
+		aml_vector_remove(&scratch->data.tilemap, tile);
+	}
+	aml_vector_remove(&scratch->data.requests, req);
+	pthread_mutex_unlock(&scratch->data.lock);
+	return 0;
+}
+
+void *aml_scratch_double_baseptr(const struct aml_scratch_data *d)
+{
+	assert(d != NULL);
+	// don't think this function makes sense for this implementation.
+	return NULL;
+}
+
+int aml_scratch_double_release(struct aml_scratch_data *d, int scratchid)
+{
+	assert(d != NULL);
+	struct aml_scratch_double *scratch = (struct aml_scratch_double *)d;
+	int *tile;
+
+	pthread_mutex_lock(&scratch->data.lock);
+	tile = aml_vector_get(&scratch->data.tilemap, scratchid);
+	if(tile != NULL)
+		aml_vector_remove(&scratch->data.tilemap, tile);
+	pthread_mutex_unlock(&scratch->data.lock);
+	return 0;
+}
+
+struct aml_scratch_ops aml_scratch_double_ops = {
+	aml_scratch_double_create_request,
+	aml_scratch_double_destroy_request,
+	aml_scratch_double_wait_request,
+	aml_scratch_double_baseptr,
+	aml_scratch_double_release,
+};
+
+/*******************************************************************************
+ * Init functions:
+ ******************************************************************************/
+
+int aml_scratch_double_create(struct aml_scratch **d, ...)
+{
+	va_list ap;
+	struct aml_scratch *ret = NULL;
+	intptr_t baseptr, dataptr;
+	va_start(ap, d);
+
+	/* alloc */
+	baseptr = (intptr_t) calloc(1, AML_SCRATCH_DOUBLE_ALLOCSIZE);
+	dataptr = baseptr + sizeof(struct aml_scratch);
+
+	ret = (struct aml_scratch *)baseptr;
+	ret->data = (struct aml_scratch_data *)dataptr;
+
+	aml_scratch_double_vinit(ret, ap);
+
+	va_end(ap);
+	*d = ret;
+	return 0;
+}
+int aml_scratch_double_vinit(struct aml_scratch *d, va_list ap)
+{
+	d->ops = &aml_scratch_double_ops;
+	struct aml_scratch_double *scratch = (struct aml_scratch_double *)d->data;
+
+	scratch->ops = aml_scratch_double_inner_ops;
+
+	scratch->data.dest_tiling = va_arg(ap, struct aml_tiling_nd *);
+	scratch->data.src_tiling = va_arg(ap, struct aml_tiling_nd *);
+	scratch->data.push_dma = va_arg(ap, struct aml_dma *);
+	scratch->data.pull_dma = va_arg(ap, struct aml_dma *);
+	size_t nbtiles = va_arg(ap, size_t);
+	size_t nbreqs = va_arg(ap, size_t);
+
+	/* allocate request array */
+	aml_vector_init(&scratch->data.requests, nbreqs,
+			sizeof(struct aml_scratch_request_double),
+			offsetof(struct aml_scratch_request_double, type),
+			AML_SCRATCH_REQUEST_TYPE_INVALID);
+
+	/* scratch init */
+	aml_vector_init(&scratch->data.tilemap, nbtiles, sizeof(int), 0, -1);
+	pthread_mutex_init(&scratch->data.lock, NULL);
+	return 0;
+}
+int aml_scratch_double_init(struct aml_scratch *d, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, d);
+	err = aml_scratch_double_vinit(d, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_scratch_double_destroy(struct aml_scratch *d)
+{
+	struct aml_scratch_double *scratch = (struct aml_scratch_double *)d->data;
+	aml_vector_destroy(&scratch->data.requests);
+	aml_vector_destroy(&scratch->data.tilemap);
+	pthread_mutex_destroy(&scratch->data.lock);
+	return 0;
+}
diff --git a/src/tiling_nd.c b/src/tiling_nd.c
new file mode 100644
index 00000000..b2051884
--- /dev/null
+++ b/src/tiling_nd.c
@@ -0,0 +1,72 @@
+#include <aml.h>
+
+struct aml_layout *aml_tiling_nd_index(const struct aml_tiling_nd *t, ...)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	va_list ap;
+	struct aml_layout *ret;
+	va_start(ap, t);
+	ret = t->ops->index(t->data, ap);
+        va_end(ap);
+	return ret;
+}
+
+struct aml_layout *aml_tiling_nd_aindex(const struct aml_tiling_nd *t, const size_t *coords)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	return t->ops->aindex(t->data, coords);
+}
+
+int aml_tiling_nd_order(const struct aml_tiling_nd *t)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	return t->ops->order(t->data);
+}
+
+int aml_tiling_nd_tile_dims(const struct aml_tiling_nd *t, ...)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	va_list ap;
+	int ret;
+	va_start(ap, t);
+	ret = t->ops->tile_dims(t->data, ap);
+	va_end(ap);
+	return ret;
+}
+
+int aml_tiling_nd_tile_adims(const struct aml_tiling_nd *t, size_t *dims)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	return t->ops->tile_adims(t->data, dims);
+}
+
+int aml_tiling_nd_dims(const struct aml_tiling_nd *t, ...)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	va_list ap;
+	int ret;
+	va_start(ap, t);
+	ret = t->ops->dims(t->data, ap);
+	va_end(ap);
+	return ret;
+}
+
+int aml_tiling_nd_adims(const struct aml_tiling_nd *t, size_t *dims)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	return t->ops->adims(t->data, dims);
+}
+
+size_t aml_tiling_nd_ndims(const struct aml_tiling_nd *t)
+{
+	assert(t != NULL);
+	assert(t->ops != NULL);
+	return t->ops->ndims(t->data);
+}
diff --git a/src/tiling_nd_collapse.c b/src/tiling_nd_collapse.c
new file mode 100644
index 00000000..d169f0c9
--- /dev/null
+++ b/src/tiling_nd_collapse.c
@@ -0,0 +1,385 @@
+#include <aml.h>
+
+int aml_tiling_nd_collapse_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *memory)
+{
+	struct aml_tiling_nd_data_collapse *dataptr;
+
+	assert(t == (struct aml_tiling_nd *)memory);
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd));
+	dataptr = memory;
+	t->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd_data_collapse));
+	dataptr->l = NULL;
+	dataptr->ndims = ndims;
+	dataptr->tile_dims = (size_t *)memory;
+	dataptr->dims = dataptr->tile_dims + ndims;
+	dataptr->border_tile_dims = dataptr->dims + ndims;
+	return 0;
+}
+
+int aml_tiling_nd_collapse_ainit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               const size_t *tile_dims)
+{
+	assert(t != NULL);
+	assert(t->data != NULL);
+	struct aml_tiling_nd_data_collapse *data =
+	    (struct aml_tiling_nd_data_collapse *)t->data;
+	assert(data->ndims == ndims);
+	assert(data->tile_dims);
+	assert(data->dims);
+	assert(data->border_tile_dims);
+	data->l = l;
+	int type = AML_TYPE_GET(tags, AML_TYPE_TILING_ORDER);
+	if (type == AML_TYPE_TILING_ROW_ORDER) {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_ROW_ORDER);
+		t->ops = &aml_tiling_nd_collapse_row_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[ndims-i-1];
+	} else {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_COLUMN_ORDER);
+		t->ops = &aml_tiling_nd_collapse_column_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[i];
+	}
+	size_t target_dims[ndims];
+	l->ops->adims_column(l->data, target_dims);
+	for (size_t i = 0; i < ndims; i++) {
+		data->border_tile_dims[i] = target_dims[i] % data->tile_dims[i];
+		data->dims[i] = target_dims[i] / data->tile_dims[i];
+		if (data->border_tile_dims[i] == 0)
+			data->border_tile_dims[i] = data->tile_dims[i];
+		else
+			data->dims[i] += 1;
+	}
+	return 0;
+}
+
+int aml_tiling_nd_collapse_vinit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               va_list data)
+{
+	size_t tile_dims[ndims];
+	for(size_t i = 0; i < ndims; i++)
+		tile_dims[i] = va_arg(data, size_t);
+	return aml_tiling_nd_collapse_ainit(t, tags, l, ndims, tile_dims);
+}
+
+int aml_tiling_nd_collapse_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, ndims);
+	err = aml_tiling_nd_collapse_vinit(t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_tiling_nd_collapse_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_COLLAPSE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_collapse_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_collapse_ainit(*t, tags, l, ndims, tile_dims);
+}
+
+int aml_tiling_nd_collapse_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_COLLAPSE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_collapse_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_collapse_vinit(*t, tags, l, ndims, data);
+}
+
+int aml_tiling_nd_collapse_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_COLLAPSE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_collapse_struct_init(*t, ndims, baseptr);
+	va_start(ap, ndims);
+	err = aml_tiling_nd_collapse_vinit(*t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_collapse_column_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t new_coords[ndims];
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+	for(size_t i = 0, j = 0; i < ndims; i++)
+		if (d->dims[i] > 1) {
+			assert(coords[j] < d->dims[i]);
+			new_coords[i] = coords[j];
+			j++;
+		} else
+			new_coords[i] = 0;
+	for(size_t i = 0; i < ndims; i++) {
+		offsets[i] = new_coords[i] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = (new_coords[i] == d->dims[i] - 1 ?
+			      d->border_tile_dims[i] :
+			      d->tile_dims[i] );
+	return d->l->ops->aslice_column(d->l->data, offsets, dims, strides);
+}
+
+struct aml_layout*
+aml_tiling_nd_collapse_column_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	size_t n_coords[d->ndims];
+	for(size_t i = 0, j = 0; i < d->ndims; i++)
+		if (d->dims[i] > 1)
+			n_coords[j++] = va_arg(coords, size_t);
+	return aml_tiling_nd_collapse_column_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_collapse_column_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_COLUMN_ORDER;
+}
+
+int
+aml_tiling_nd_collapse_column_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[i];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_collapse_column_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	memcpy((void*)tile_dims, (void*)d->tile_dims, sizeof(size_t)*d->ndims);
+	return 0;	
+}
+
+int
+aml_tiling_nd_collapse_column_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		if (d->dims[i] > 1) {
+			size_t *dim = va_arg(dims_ptrs, size_t*);
+			assert(dim != NULL);
+			*dim = d->dims[i];
+		}
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_collapse_column_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0, j = 0; i < d->ndims; i++)
+		if (d->dims[i] > 1)
+			dims[j++] = d->dims[i];
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_collapse_column_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	size_t ndims = 0;
+	for(size_t i = 0; i < d->ndims; i++)
+		if (d->dims[i] > 1)
+			ndims++;
+	return ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_collapse_column_ops = {
+	aml_tiling_nd_collapse_column_index,
+	aml_tiling_nd_collapse_column_aindex,
+	aml_tiling_nd_collapse_column_order,
+	aml_tiling_nd_collapse_column_tile_dims,
+	aml_tiling_nd_collapse_column_tile_adims,
+	aml_tiling_nd_collapse_column_dims,
+	aml_tiling_nd_collapse_column_adims,
+	aml_tiling_nd_collapse_column_ndims
+};
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_collapse_row_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t new_coords[ndims];
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+
+	for(size_t i = 0, j = 0; i < ndims; i++)
+		if (d->dims[ndims - i - 1] > 1) {
+			assert(coords[j] < d->dims[ndims - i - 1]);
+			new_coords[ndims - i - 1] = coords[j];
+			j++;
+		} else
+			new_coords[ndims - i - 1] = 0;
+	for(size_t i = 0; i < ndims; i++) {
+		
+		offsets[i] = new_coords[i] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = (new_coords[i] == d->dims[i] - 1 ?
+			      d->border_tile_dims[i] :
+			      d->tile_dims[i] );
+	return d->l->ops->aslice_column(d->l->data, offsets, dims, strides);
+}
+
+struct aml_layout*
+aml_tiling_nd_collapse_row_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	size_t n_coords[d->ndims];
+	for(size_t i = 0, j = 0; i < d->ndims; i++)
+		if (d->dims[i] > 1)
+			n_coords[j++] = va_arg(coords, size_t);
+	return aml_tiling_nd_collapse_row_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_collapse_row_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_ROW_ORDER;
+}
+
+int
+aml_tiling_nd_collapse_row_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_collapse_row_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		tile_dims[i] = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;	
+}
+
+int
+aml_tiling_nd_collapse_row_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		if (d->dims[d->ndims - i - 1] > 1) {
+			size_t *dim = va_arg(dims_ptrs, size_t*);
+			assert(dim != NULL);
+			*dim = d->dims[d->ndims - i - 1];
+		}
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_collapse_row_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	for(size_t i = 0, j = 0; i < d->ndims; i++)
+		if (d->dims[d->ndims - i - 1] > 1)
+			dims[j++] = d->dims[d->ndims - i - 1];
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_collapse_row_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_collapse *d =
+	    (const struct aml_tiling_nd_data_collapse *)l;
+	assert(d != NULL);
+	size_t ndims = 0;
+	for(size_t i = 0; i < d->ndims; i++)
+		if (d->dims[i] > 1)
+			ndims++;
+	return ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_collapse_row_ops = {
+	aml_tiling_nd_collapse_row_index,
+	aml_tiling_nd_collapse_row_aindex,
+	aml_tiling_nd_collapse_row_order,
+	aml_tiling_nd_collapse_row_tile_dims,
+	aml_tiling_nd_collapse_row_tile_adims,
+	aml_tiling_nd_collapse_row_dims,
+	aml_tiling_nd_collapse_row_adims,
+	aml_tiling_nd_collapse_row_ndims
+};
diff --git a/src/tiling_nd_pad.c b/src/tiling_nd_pad.c
new file mode 100644
index 00000000..8445ddc2
--- /dev/null
+++ b/src/tiling_nd_pad.c
@@ -0,0 +1,426 @@
+#include <aml.h>
+
+int aml_tiling_nd_pad_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *memory)
+{
+	struct aml_tiling_nd_data_pad *dataptr;
+
+	assert(t == (struct aml_tiling_nd *)memory);
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd));
+	dataptr = memory;
+	t->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd_data_pad));
+	dataptr->l = NULL;
+	dataptr->ndims = ndims;
+	dataptr->tile_dims = (size_t *)memory;
+	dataptr->dims = dataptr->tile_dims + ndims;
+	dataptr->border_tile_dims = dataptr->dims + ndims;
+	dataptr->pad = dataptr->border_tile_dims + ndims;
+	dataptr->neutral = (void *)(dataptr->pad + ndims);
+	return 0;
+}
+
+int aml_tiling_nd_pad_ainit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               const size_t *tile_dims, void *neutral)
+{
+	assert(t != NULL);
+	assert(t->data != NULL);
+	struct aml_tiling_nd_data_pad *data =
+	    (struct aml_tiling_nd_data_pad *)t->data;
+	size_t element_size = aml_layout_element_size(l);
+	assert(data->ndims == ndims);
+	assert(data->tile_dims);
+	assert(data->dims);
+	assert(data->border_tile_dims);
+	assert(data->pad);
+	assert(data->neutral);
+	data->l = l;
+	int type = AML_TYPE_GET(tags, AML_TYPE_TILING_ORDER);
+	if (type == AML_TYPE_TILING_ROW_ORDER) {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_ROW_ORDER);
+		t->ops = &aml_tiling_nd_pad_row_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[ndims-i-1];
+	} else {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_COLUMN_ORDER);
+		t->ops = &aml_tiling_nd_pad_column_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[i];
+	}
+	size_t target_dims[ndims];
+	l->ops->adims_column(l->data, target_dims);
+	for (size_t i = 0; i < ndims; i++) {
+		data->border_tile_dims[i] = target_dims[i] % data->tile_dims[i];
+		data->dims[i] = target_dims[i] / data->tile_dims[i];
+		if (data->border_tile_dims[i] == 0)
+			data->border_tile_dims[i] = data->tile_dims[i];
+		else {
+			data->dims[i] += 1;
+			data->pad[i] = 1;
+		}
+	}
+	memcpy(data->neutral, neutral, element_size);
+	return 0;
+}
+
+int aml_tiling_nd_pad_vinit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               va_list data)
+{
+	size_t tile_dims[ndims];
+	void *neutral;
+	for(size_t i = 0; i < ndims; i++)
+		tile_dims[i] = va_arg(data, size_t);
+	neutral = va_arg(data, void*);
+	return aml_tiling_nd_pad_ainit(t, tags, l, ndims, tile_dims, neutral);
+}
+
+int aml_tiling_nd_pad_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, ndims);
+	err = aml_tiling_nd_pad_vinit(t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_tiling_nd_pad_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims, void *neutral)
+{
+	assert(ndims > 0);
+	size_t element_size = aml_layout_element_size(l);
+	void *baseptr = calloc(1, AML_TILING_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_pad_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_pad_ainit(*t, tags, l, ndims, tile_dims, neutral);
+}
+
+int aml_tiling_nd_pad_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data)
+{
+	assert(ndims > 0);
+	size_t element_size = aml_layout_element_size(l);
+	void *baseptr = calloc(1, AML_TILING_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_pad_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_pad_vinit(*t, tags, l, ndims, data);
+}
+
+int aml_tiling_nd_pad_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	assert(ndims > 0);
+	size_t element_size = aml_layout_element_size(l);
+	void *baseptr = calloc(1, AML_TILING_PAD_ALLOCSIZE(ndims,
+							   element_size));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_pad_struct_init(*t, ndims, baseptr);
+	va_start(ap, ndims);
+	err = aml_tiling_nd_pad_vinit(*t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_pad_column_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+	for (size_t i = 0; i < ndims; i++)
+		assert(coords[i] < d->dims[i]);
+	for (size_t i = 0; i < ndims; i++) {
+		offsets[i] = coords[i] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+
+	int pad = 0;
+	for (size_t i = 0; i < ndims; i++) {
+		if (coords[i] == d->dims[i] - 1) {
+			dims[i] = d->border_tile_dims[i];
+			if (d->pad[i])
+				pad = 1;
+		} else
+			dims[i] = d->tile_dims[i];
+	}
+	struct aml_layout *res = d->l->ops->aslice_column(d->l->data, offsets,
+							  dims, strides);
+	if (pad) {
+		struct aml_layout *p_layout;
+		int order = aml_layout_order(d->l);
+		if (order == AML_TYPE_LAYOUT_COLUMN_ORDER) {
+			/* WARNING: OWNERSHIP!!! */
+			aml_layout_pad_acreate(&p_layout,
+					       AML_TYPE_LAYOUT_COLUMN_ORDER,
+					       res, d->tile_dims, d->neutral);
+		} else {
+			size_t row_dims[ndims];
+			for (size_t i = 0; i < ndims; i++)
+				row_dims[i] = d->tile_dims[i];
+			/* WARNING: OWNERSHIP!!! */
+			aml_layout_pad_acreate(&p_layout,
+					       AML_TYPE_LAYOUT_ROW_ORDER,
+					       res, row_dims, d->neutral);
+		}
+		return p_layout;
+	} else
+		return res;
+}
+
+struct aml_layout*
+aml_tiling_nd_pad_column_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	size_t n_coords[d->ndims];
+	for (size_t i = 0; i < d->ndims; i++)
+		n_coords[i] = va_arg(coords, size_t);
+	return aml_tiling_nd_pad_column_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_pad_column_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_COLUMN_ORDER;
+}
+
+int
+aml_tiling_nd_pad_column_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[i];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_pad_column_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	memcpy((void*)tile_dims, (void*)d->tile_dims, sizeof(size_t)*d->ndims);
+	return 0;	
+}
+
+int
+aml_tiling_nd_pad_column_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[i];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_pad_column_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	memcpy((void*)dims, (void*)d->dims, sizeof(size_t)*d->ndims);
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_pad_column_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	return d->ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_pad_column_ops = {
+	aml_tiling_nd_pad_column_index,
+	aml_tiling_nd_pad_column_aindex,
+	aml_tiling_nd_pad_column_order,
+	aml_tiling_nd_pad_column_tile_dims,
+	aml_tiling_nd_pad_column_tile_adims,
+	aml_tiling_nd_pad_column_dims,
+	aml_tiling_nd_pad_column_adims,
+	aml_tiling_nd_pad_column_ndims
+};
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_pad_row_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+
+	for(size_t i = 0; i < ndims; i++)
+		assert(coords[ndims - i - 1] < d->dims[i]);
+	for(size_t i = 0; i < ndims; i++) {
+		offsets[i] = coords[ndims - i - 1] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+
+	int pad = 0;
+	for (size_t i = 0; i < ndims; i++) {
+		if (coords[ndims - i - 1] == d->dims[i] - 1) {
+			dims[i] = d->border_tile_dims[i];
+			if (d->pad[i])
+				pad = 1;
+		} else
+			dims[i] = d->tile_dims[i];
+	}
+	struct aml_layout *res = d->l->ops->aslice_column(d->l->data, offsets,
+							  dims, strides);
+	if (pad) {
+		struct aml_layout *p_layout;
+		int order = aml_layout_order(d->l);
+		if (order == AML_TYPE_LAYOUT_COLUMN_ORDER) {
+			/* WARNING: OWNERSHIP!!! */
+			aml_layout_pad_acreate(&p_layout,
+					       AML_TYPE_LAYOUT_COLUMN_ORDER,
+					       res, d->tile_dims, d->neutral);
+		} else {
+			size_t row_dims[ndims];
+			for (size_t i = 0; i < ndims; i++)
+				row_dims[i] = d->tile_dims[ndims - i - 1];
+			/* WARNING: OWNERSHIP!!! */
+			aml_layout_pad_acreate(&p_layout,
+					       AML_TYPE_LAYOUT_ROW_ORDER,
+					       res, row_dims, d->neutral);
+		}
+		return p_layout;
+	} else
+		return res;
+}
+
+struct aml_layout*
+aml_tiling_nd_pad_row_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	size_t n_coords[d->ndims];
+	for(size_t i = 0; i < d->ndims; i++)
+		n_coords[i] = va_arg(coords, size_t);
+	return aml_tiling_nd_pad_row_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_pad_row_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_ROW_ORDER;
+}
+
+int
+aml_tiling_nd_pad_row_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_pad_row_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		tile_dims[i] = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;	
+}
+
+int
+aml_tiling_nd_pad_row_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_pad_row_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		dims[i] = d->dims[d->ndims - i - 1];
+	}
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_pad_row_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_pad *d =
+	    (const struct aml_tiling_nd_data_pad *)l;
+	assert(d != NULL);
+	return d->ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_pad_row_ops = {
+	aml_tiling_nd_pad_row_index,
+	aml_tiling_nd_pad_row_aindex,
+	aml_tiling_nd_pad_row_order,
+	aml_tiling_nd_pad_row_tile_dims,
+	aml_tiling_nd_pad_row_tile_adims,
+	aml_tiling_nd_pad_row_dims,
+	aml_tiling_nd_pad_row_adims,
+	aml_tiling_nd_pad_row_ndims
+};
diff --git a/src/tiling_nd_resize.c b/src/tiling_nd_resize.c
new file mode 100644
index 00000000..0352a737
--- /dev/null
+++ b/src/tiling_nd_resize.c
@@ -0,0 +1,356 @@
+#include <aml.h>
+
+int aml_tiling_nd_resize_struct_init(struct aml_tiling_nd *t, size_t ndims,
+				     void *memory)
+{
+	struct aml_tiling_nd_data_resize *dataptr;
+
+	assert(t == (struct aml_tiling_nd *)memory);
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd));
+	dataptr = memory;
+	t->data = memory;
+	memory = (void *)((uintptr_t)memory +
+		     sizeof(struct aml_tiling_nd_data_resize));
+	dataptr->l = NULL;
+	dataptr->ndims = ndims;
+	dataptr->tile_dims = (size_t *)memory;
+	dataptr->dims = dataptr->tile_dims + ndims;
+	dataptr->border_tile_dims = dataptr->dims + ndims;
+	return 0;
+}
+
+int aml_tiling_nd_resize_ainit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               const size_t *tile_dims)
+{
+	assert(t != NULL);
+	assert(t->data != NULL);
+	struct aml_tiling_nd_data_resize *data =
+	    (struct aml_tiling_nd_data_resize *)t->data;
+	assert(data->ndims == ndims);
+	assert(data->tile_dims);
+	assert(data->dims);
+	assert(data->border_tile_dims);
+	data->l = l;
+	int type = AML_TYPE_GET(tags, AML_TYPE_TILING_ORDER);
+	if (type == AML_TYPE_TILING_ROW_ORDER) {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_ROW_ORDER);
+		t->ops = &aml_tiling_nd_resize_row_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[ndims-i-1];
+	} else {
+		AML_TYPE_SET(t->tags, AML_TYPE_TILING_ORDER,
+			     AML_TYPE_TILING_COLUMN_ORDER);
+		t->ops = &aml_tiling_nd_resize_column_ops;
+		for (size_t i = 0; i < ndims; i++)
+			data->tile_dims[i] = tile_dims[i];
+	}
+	size_t target_dims[ndims];
+	l->ops->adims_column(l->data, target_dims);
+	for (size_t i = 0; i < ndims; i++) {
+		data->border_tile_dims[i] = target_dims[i] % data->tile_dims[i];
+		data->dims[i] = target_dims[i] / data->tile_dims[i];
+		if (data->border_tile_dims[i] == 0)
+			data->border_tile_dims[i] = data->tile_dims[i];
+		else
+			data->dims[i] += 1;
+	}
+	return 0;
+}
+
+int aml_tiling_nd_resize_vinit(struct aml_tiling_nd *t, uint64_t tags,
+                               const struct aml_layout *l, size_t ndims,
+                               va_list data)
+{
+	size_t tile_dims[ndims];
+	for(size_t i = 0; i < ndims; i++)
+		tile_dims[i] = va_arg(data, size_t);
+	return aml_tiling_nd_resize_ainit(t, tags, l, ndims, tile_dims);
+}
+
+int aml_tiling_nd_resize_init(struct aml_tiling_nd *t, uint64_t tags,
+			      const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	va_start(ap, ndims);
+	err = aml_tiling_nd_resize_vinit(t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+int aml_tiling_nd_resize_acreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 const size_t *tile_dims)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_RESIZE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_resize_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_resize_ainit(*t, tags, l, ndims, tile_dims);
+}
+
+int aml_tiling_nd_resize_vcreate(struct aml_tiling_nd **t, uint64_t tags,
+				 const struct aml_layout *l, size_t ndims,
+				 va_list data)
+{
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_RESIZE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_resize_struct_init(*t, ndims, baseptr);
+	return aml_tiling_nd_resize_vinit(*t, tags, l, ndims, data);
+}
+
+int aml_tiling_nd_resize_create(struct aml_tiling_nd **t, uint64_t tags,
+				const struct aml_layout *l, size_t ndims, ...)
+{
+	int err;
+	va_list ap;
+	assert(ndims > 0);
+	void *baseptr = calloc(1, AML_TILING_RESIZE_ALLOCSIZE(ndims));
+	*t = (struct aml_tiling_nd *)baseptr;
+	aml_tiling_nd_resize_struct_init(*t, ndims, baseptr);
+	va_start(ap, ndims);
+	err = aml_tiling_nd_resize_vinit(*t, tags, l, ndims, ap);
+	va_end(ap);
+	return err;
+}
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_resize_column_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+	for(size_t i = 0; i < ndims; i++)
+		assert(coords[i] < d->dims[i]);
+	for(size_t i = 0; i < ndims; i++) {
+		offsets[i] = coords[i] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = (coords[i] == d->dims[i] - 1 ?
+			      d->border_tile_dims[i] :
+			      d->tile_dims[i] );
+	return d->l->ops->aslice_column(d->l->data, offsets, dims, strides);
+}
+
+struct aml_layout*
+aml_tiling_nd_resize_column_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	size_t n_coords[d->ndims];
+	for(size_t i = 0; i < d->ndims; i++)
+		n_coords[i] = va_arg(coords, size_t);
+	return aml_tiling_nd_resize_column_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_resize_column_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_COLUMN_ORDER;
+}
+
+int
+aml_tiling_nd_resize_column_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[i];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_resize_column_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	memcpy((void*)tile_dims, (void*)d->tile_dims, sizeof(size_t)*d->ndims);
+	return 0;	
+}
+
+int
+aml_tiling_nd_resize_column_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[i];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_resize_column_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	memcpy((void*)dims, (void*)d->dims, sizeof(size_t)*d->ndims);
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_resize_column_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	return d->ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_resize_column_ops = {
+	aml_tiling_nd_resize_column_index,
+	aml_tiling_nd_resize_column_aindex,
+	aml_tiling_nd_resize_column_order,
+	aml_tiling_nd_resize_column_tile_dims,
+	aml_tiling_nd_resize_column_tile_adims,
+	aml_tiling_nd_resize_column_dims,
+	aml_tiling_nd_resize_column_adims,
+	aml_tiling_nd_resize_column_ndims
+};
+
+/*----------------------------------------------------------------------------*/
+
+struct aml_layout*
+aml_tiling_nd_resize_row_aindex(const struct aml_tiling_nd_data *l,
+				   const size_t *coords)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	size_t ndims = d->ndims;
+	size_t offsets[ndims];
+	size_t dims[ndims];
+	size_t strides[ndims];
+
+	for(size_t i = 0; i < ndims; i++)
+		assert(coords[ndims - i - 1] < d->dims[i]);
+	for(size_t i = 0; i < ndims; i++) {
+		offsets[i] = coords[ndims - i - 1] * d->tile_dims[i];
+		strides[i] = 1;
+	}
+	for(size_t i = 0; i < ndims; i++)
+		dims[i] = (coords[ndims - i - 1] == d->dims[i] - 1 ?
+			      d->border_tile_dims[i] :
+			      d->tile_dims[i] );
+	return d->l->ops->aslice_column(d->l->data, offsets, dims, strides);
+}
+
+struct aml_layout*
+aml_tiling_nd_resize_row_index(const struct aml_tiling_nd_data *l,
+				  va_list coords)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	size_t n_coords[d->ndims];
+	for(size_t i = 0; i < d->ndims; i++)
+		n_coords[i] = va_arg(coords, size_t);
+	return aml_tiling_nd_resize_row_aindex(l, n_coords);
+}
+
+int
+aml_tiling_nd_resize_row_order(const struct aml_tiling_nd_data * l)
+{
+	return AML_TYPE_TILING_ROW_ORDER;
+}
+
+int
+aml_tiling_nd_resize_row_tile_dims(const struct aml_tiling_nd_data *l,
+				      va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_resize_row_tile_adims(const struct aml_tiling_nd_data *l,
+				       size_t *tile_dims)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		tile_dims[i] = d->tile_dims[d->ndims - i - 1];
+	}
+	return 0;	
+}
+
+int
+aml_tiling_nd_resize_row_dims(const struct aml_tiling_nd_data *l,
+				 va_list dims_ptrs)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		size_t *dim = va_arg(dims_ptrs, size_t*);
+		assert(dim != NULL);
+		*dim = d->dims[d->ndims - i - 1];
+	}
+	return 0;
+}
+
+int
+aml_tiling_nd_resize_row_adims(const struct aml_tiling_nd_data *l,
+				  size_t *dims)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	for(size_t i = 0; i < d->ndims; i++) {
+		dims[i] = d->dims[d->ndims - i - 1];
+	}
+	return 0;	
+}
+
+size_t
+aml_tiling_nd_resize_row_ndims(const struct aml_tiling_nd_data *l)
+{
+	const struct aml_tiling_nd_data_resize *d =
+	    (const struct aml_tiling_nd_data_resize *)l;
+	assert(d != NULL);
+	return d->ndims;
+}
+
+struct aml_tiling_nd_ops aml_tiling_nd_resize_row_ops = {
+	aml_tiling_nd_resize_row_index,
+	aml_tiling_nd_resize_row_aindex,
+	aml_tiling_nd_resize_row_order,
+	aml_tiling_nd_resize_row_tile_dims,
+	aml_tiling_nd_resize_row_tile_adims,
+	aml_tiling_nd_resize_row_dims,
+	aml_tiling_nd_resize_row_adims,
+	aml_tiling_nd_resize_row_ndims
+};
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7a053236..35ccff90 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -39,7 +39,8 @@ UNIT_TESTS = $(ARENA_JEMALLOC_TESTS) \
 	     $(AREA_LINUX_TESTS) \
 	     $(AREA_POSIX_TESTS) \
 	     $(DMA_LINUX_TESTS) \
-	     $(SCRATCH_TESTS)
+	     $(SCRATCH_TESTS) \
+	     layout copy tiling_nd dma_layout
 
 # all tests
 TST_PROGS = $(UNIT_TESTS)
diff --git a/tests/copy.c b/tests/copy.c
new file mode 100644
index 00000000..854719fa
--- /dev/null
+++ b/tests/copy.c
@@ -0,0 +1,1178 @@
+#include <aml.h>
+#include <assert.h>
+
+void test_copy_2d(void)
+{
+	size_t elem_number[2] = { 5, 3 };
+	size_t src_pitch[2] = { 10, 6 };
+	size_t dst_pitch[2] = { 5, 3 };
+
+	double src[6][10];
+	double dst[3][5];
+	double dst2[6][10];
+
+	double ref_dst2[6][10];
+	double ref_dst[3][5];
+
+	for (int j = 0; j < 6; j++)
+		for (int i = 0; i < 10; i++) {
+			src[j][i] = (double)(i + j * 10);
+			ref_dst2[j][i] = 0.0;
+			dst2[j][i] = 0.0;
+		}
+	for (int j = 0; j < 3; j++)
+		for (int i = 0; i < 5; i++) {
+			dst[j][i] = 0.0;
+			ref_dst[j][i] = src[j][i];
+			ref_dst2[j][i] = src[j][i];
+		}
+
+	aml_copy_nd(2, dst, dst_pitch, src, src_pitch, elem_number,
+		    sizeof(double));
+	for (int j = 0; j < 3; j++)
+		for (int i = 0; i < 5; i++)
+			assert(ref_dst[j][i] == dst[j][i]);
+
+	aml_copy_nd(2, dst2, src_pitch, dst, dst_pitch, elem_number,
+		    sizeof(double));
+	for (int j = 0; j < 6; j++)
+		for (int i = 0; i < 10; i++)
+			assert(ref_dst2[j][i] == dst2[j][i]);
+
+}
+
+void test_copy_t2d(void)
+{
+	size_t elem_number[2] = { 5, 3 };
+	size_t elem_number2[2] = { 3, 5 };
+	size_t src_pitch[2] = { 10, 6 };
+	size_t dst_pitch[2] = { 3, 5 };
+
+	double src[6][10];
+	double dst[5][3];
+	double dst2[6][10];
+
+	double ref_dst2[6][10];
+	double ref_dst[5][3];
+
+	for (int j = 0; j < 6; j++)
+		for (int i = 0; i < 10; i++) {
+			src[j][i] = (double)(i + j * 10);
+			ref_dst2[j][i] = 0.0;
+			dst2[j][i] = 0.0;
+		}
+	for (int j = 0; j < 3; j++)
+		for (int i = 0; i < 5; i++) {
+			dst[i][j] = 0.0;
+			ref_dst[i][j] = src[j][i];
+			ref_dst2[j][i] = src[j][i];
+		}
+
+	aml_copy_tnd(2, dst, dst_pitch, src, src_pitch, elem_number,
+		     sizeof(double));
+	for (int j = 0; j < 3; j++)
+		for (int i = 0; i < 5; i++)
+			assert(ref_dst[i][j] == dst[i][j]);
+
+	aml_copy_tnd(2, dst2, src_pitch, dst, dst_pitch, elem_number2,
+		     sizeof(double));
+	for (int j = 0; j < 6; j++)
+		for (int i = 0; i < 10; i++)
+			assert(ref_dst2[j][i] == dst2[j][i]);
+
+}
+
+void test_copy_3d(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t src_pitch[3] = { 10, 6, 4 };
+	size_t dst_pitch[3] = { 5, 3, 2 };
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[k][j][i];
+				ref_dst2[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_nd(3, dst, dst_pitch, src, src_pitch, elem_number,
+		    sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_nd(3, dst2, src_pitch, dst, dst_pitch, elem_number,
+		    sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+
+}
+
+void test_copy_3d_c(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t c_src_pitch[4] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4 };
+	size_t c_dst_pitch[4] = { 8, 8 * 5, 8 * 5 * 3, 8 * 5 * 3 * 2 };
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[k][j][i];
+				ref_dst2[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_nd_c(3, dst, c_dst_pitch, src, c_src_pitch, elem_number,
+		    sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_nd_c(3, dst2, c_src_pitch, dst, c_dst_pitch, elem_number,
+		    sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+
+}
+
+void test_copy_3dstr(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t src_pitch[3] = { 10, 6, 4 };
+	size_t src_stride[3] = { 2, 2, 2 };
+	size_t dst_pitch[3] = { 5, 3, 2 };
+	size_t dst_stride[3] = { 1, 1, 1 };
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[2 * k][2 * j][2 * i];
+				ref_dst2[2 * k][2 * j][2 * i] =
+				    src[2 * k][2 * j][2 * i];
+			}
+
+	aml_copy_ndstr(3, dst, dst_pitch, dst_stride, src, src_pitch,
+		       src_stride, elem_number, sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_ndstr(3, dst2, src_pitch, src_stride, dst, dst_pitch,
+		       dst_stride, elem_number, sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+}
+
+void test_copy_3dstr_c(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t c_src_pitch[4] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4 };
+	size_t src_stride[3] = { 2, 2, 2 };
+	size_t c_dst_pitch[4] = { 8, 8 * 5, 8 * 5 * 3, 8 * 5 * 3 * 2 };
+	size_t dst_stride[3] = { 1, 1, 1 };
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[2 * k][2 * j][2 * i];
+				ref_dst2[2 * k][2 * j][2 * i] =
+				    src[2 * k][2 * j][2 * i];
+			}
+
+	aml_copy_ndstr_c(3, dst, c_dst_pitch, dst_stride, src, c_src_pitch,
+			 src_stride, elem_number, sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_ndstr_c(3, dst2, c_src_pitch, src_stride, dst, c_dst_pitch,
+		       dst_stride, elem_number, sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+}
+
+void test_copy_t3d(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t elem_number2[3] = { 3, 2, 5 };
+	size_t elem_number3[3] = { 2, 5, 3 };
+	size_t src_pitch[3] = { 10, 6, 4 };
+	size_t dst_pitch[3] = { 3, 2, 5 };
+	size_t dst_pitch2[3] = { 2, 5, 3 };
+
+	double src[4][6][10];
+	double dst[5][2][3];
+	double dst2[3][5][2];
+	double dst3[4][6][10];
+
+	double ref_dst[5][2][3];
+	double ref_dst2[3][5][2];
+	double ref_dst3[4][6][10];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst3[k][j][i] = 0.0;
+				dst3[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[i][k][j] = 0.0;
+				dst2[j][i][k] = 0.0;
+				ref_dst[i][k][j] = src[k][j][i];
+				ref_dst2[j][i][k] = src[k][j][i];
+				ref_dst3[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_tnd(3, dst, dst_pitch, src, src_pitch, elem_number,
+		     sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[i][k][j] == dst[i][k][j]);
+
+	aml_copy_tnd(3, dst2, dst_pitch2, dst, dst_pitch, elem_number2,
+		     sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst2[j][i][k] == dst2[j][i][k]);
+
+	aml_copy_tnd(3, dst3, src_pitch, dst2, dst_pitch2, elem_number3,
+		     sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst3[k][j][i] == dst3[k][j][i]);
+}
+
+void test_copy_rt3d(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t elem_number2[3] = { 2, 5, 3 };
+	size_t elem_number3[3] = { 3, 2, 5 };
+	size_t src_pitch[3] = { 10, 6, 4 };
+	size_t dst_pitch[3] = { 2, 5, 3 };
+	size_t dst_pitch2[3] = { 3, 2, 5 };
+
+	double src[4][6][10];
+	double dst[3][5][2];
+	double dst2[5][2][3];
+	double dst3[4][6][10];
+
+	double ref_dst[3][5][2];
+	double ref_dst2[5][2][3];
+	double ref_dst3[4][6][10];
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst3[k][j][i] = 0.0;
+				dst3[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[j][i][k] = 0.0;
+				dst2[i][k][j] = 0.0;
+				ref_dst[j][i][k] = src[k][j][i];
+				ref_dst2[i][k][j] = src[k][j][i];
+				ref_dst3[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_rtnd(3, dst, dst_pitch, src, src_pitch, elem_number,
+		      sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[j][i][k] == dst[j][i][k]);
+
+	aml_copy_rtnd(3, dst2, dst_pitch2, dst, dst_pitch, elem_number2,
+		      sizeof(double));
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst2[i][k][j] == dst2[i][k][j]);
+
+	aml_copy_rtnd(3, dst3, src_pitch, dst2, dst_pitch2, elem_number3,
+		      sizeof(double));
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst3[k][j][i] == dst3[k][j][i]);
+}
+
+void test_copy_t4d(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t src_pitch[4] = { 10, 6, 4, 8 };
+	size_t dst_pitch[4] = { 3, 2, 4, 5 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] = src[l][k][j][i];
+					ref_dst2[l][k][j][i] = src[l][k][j][i];
+				}
+
+	aml_copy_tnd(4, dst, dst_pitch, src, src_pitch, elem_number,
+		     sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_rtnd(4, dst2, src_pitch, dst, dst_pitch, elem_number2,
+		      sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_t4d_c(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t c_src_pitch[5] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4,
+				  8 * 10 * 6 * 4 * 8 };
+	size_t c_dst_pitch[5] = { 8, 8 * 3, 8 * 3 * 2, 8 * 3 * 2 * 4,
+				  8 * 3 * 2 * 4 * 5 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] = src[l][k][j][i];
+					ref_dst2[l][k][j][i] = src[l][k][j][i];
+				}
+
+	aml_copy_tnd_c(4, dst, c_dst_pitch, src, c_src_pitch, elem_number,
+		     sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_rtnd_c(4, dst2, c_src_pitch, dst, c_dst_pitch, elem_number2,
+		      sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_t4dstr(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t src_pitch[4] = { 10, 6, 4, 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t dst_pitch[4] = { 3, 2, 4, 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_tndstr(4, dst, dst_pitch, dst_stride, src, src_pitch,
+			src_stride, elem_number, sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_rtndstr(4, dst2, src_pitch, src_stride, dst, dst_pitch,
+			 dst_stride, elem_number2, sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_t4dstr_c(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t c_src_pitch[5] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4,
+				  8 * 10 * 6 * 4 * 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t c_dst_pitch[5] = { 8, 8 * 3, 8 * 3 * 2, 8 * 3 * 2 * 4,
+				  8 * 3 * 2 * 4 * 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_tndstr_c(4, dst, c_dst_pitch, dst_stride, src, c_src_pitch,
+			  src_stride, elem_number, sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_rtndstr_c(4, dst2, c_src_pitch, src_stride, dst, c_dst_pitch,
+			   dst_stride, elem_number2, sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_sh4d(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 2, 3, 4, 5 };
+	size_t target_dims[4] = { 2, 1, 3, 0 };
+	size_t target_dims2[4] = { 3, 1, 0, 2 };
+	size_t src_pitch[4] = { 10, 6, 4, 8 };
+	size_t dst_pitch[4] = { 2, 3, 4, 5 };
+
+	double src[8][4][6][10];
+	double dst[5][4][3][2];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][3][2];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][j][k] = 0.0;
+					ref_dst[i][l][j][k] = src[l][k][j][i];
+					ref_dst2[l][k][j][i] = src[l][k][j][i];
+				}
+
+	aml_copy_shnd(4, target_dims, dst, dst_pitch, src, src_pitch,
+		      elem_number, sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][j][k] ==
+					       dst[i][l][j][k]);
+
+	aml_copy_shnd(4, target_dims2, dst2, src_pitch, dst, dst_pitch,
+		      elem_number2, sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_sh4d_c(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 2, 3, 4, 5 };
+	size_t target_dims[4] = { 2, 1, 3, 0 };
+	size_t target_dims2[4] = { 3, 1, 0, 2 };
+	size_t c_src_pitch[5] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4,
+				  8 * 10 * 6 * 4 * 8 };
+	size_t c_dst_pitch[5] = { 8, 8 * 2, 8 * 2 * 3, 8 * 2 * 3 * 4,
+				  8 * 2 * 3 * 4 * 5 };
+
+	double src[8][4][6][10];
+	double dst[5][4][3][2];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][3][2];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][j][k] = 0.0;
+					ref_dst[i][l][j][k] = src[l][k][j][i];
+					ref_dst2[l][k][j][i] = src[l][k][j][i];
+				}
+
+	aml_copy_shnd_c(4, target_dims, dst, c_dst_pitch, src, c_src_pitch,
+			elem_number, sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][j][k] ==
+					       dst[i][l][j][k]);
+
+	aml_copy_shnd_c(4, target_dims2, dst2, c_src_pitch, dst, c_dst_pitch,
+			elem_number2, sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_sh4dstr(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 2, 3, 4, 5 };
+	size_t target_dims[4] = { 2, 1, 3, 0 };
+	size_t target_dims2[4] = { 3, 1, 0, 2 };
+	size_t src_pitch[4] = { 10, 6, 4, 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t dst_pitch[4] = { 2, 3, 4, 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][3][2];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][3][2];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][j][k] = 0.0;
+					ref_dst[i][l][j][k] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_shndstr(4, target_dims, dst, dst_pitch, dst_stride, src,
+			 src_pitch, src_stride, elem_number, sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][j][k] ==
+					       dst[i][l][j][k]);
+
+	aml_copy_shndstr(4, target_dims2, dst2, src_pitch, src_stride, dst,
+			 dst_pitch, dst_stride, elem_number2, sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_sh4dstr_c(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 2, 3, 4, 5 };
+	size_t target_dims[4] = { 2, 1, 3, 0 };
+	size_t target_dims2[4] = { 3, 1, 0, 2 };
+	size_t c_src_pitch[5] = { 8, 8 * 10, 8 * 10 * 6, 8 * 10 * 6 * 4,
+				  8 * 10 * 6 * 4 * 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t c_dst_pitch[5] = { 8, 8 * 2, 8 * 2 * 3, 8 * 2 * 3 * 4,
+				  8 * 2 * 3 * 4 * 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][3][2];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][3][2];
+	double ref_dst2[8][4][6][10];
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][j][k] = 0.0;
+					ref_dst[i][l][j][k] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_shndstr_c(4, target_dims, dst, c_dst_pitch, dst_stride, src,
+			   c_src_pitch, src_stride, elem_number,
+			   sizeof(double));
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][j][k] ==
+					       dst[i][l][j][k]);
+
+	aml_copy_shndstr_c(4, target_dims2, dst2, c_src_pitch, src_stride, dst,
+			   c_dst_pitch, dst_stride, elem_number2,
+			   sizeof(double));
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_copy_layout(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t c_src_pitch[3] = { 10, 6, 4 };
+	size_t src_stride[3] = { 1, 1, 1};
+	size_t c_dst_pitch[3] = { 5, 3, 2 };
+	size_t dst_stride[3] = { 1, 1, 1};
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst2_layout, 3);
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 3, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 3, elem_number,
+				dst_stride, c_dst_pitch);
+	aml_layout_native_ainit(&dst2_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst2, sizeof(double), 3, elem_number,
+				src_stride, c_src_pitch);
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[k][j][i];
+				ref_dst2[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_layout_native(&dst_layout, &src_layout);
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_layout_native(&dst2_layout, &dst_layout);
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+
+}
+
+void test_copy_layout_generic(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t c_src_pitch[3] = { 10, 6, 4 };
+	size_t src_stride[3] = { 1, 1, 1};
+	size_t c_dst_pitch[3] = { 5, 3, 2 };
+	size_t dst_stride[3] = { 1, 1, 1};
+
+	double src[4][6][10];
+	double dst[2][3][5];
+	double dst2[4][6][10];
+
+	double ref_dst2[4][6][10];
+	double ref_dst[2][3][5];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst2_layout, 3);
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 3, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 3, elem_number,
+				dst_stride, c_dst_pitch);
+	aml_layout_native_ainit(&dst2_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst2, sizeof(double), 3, elem_number,
+				src_stride, c_src_pitch);
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				ref_dst2[k][j][i] = 0.0;
+				dst2[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[k][j][i];
+				ref_dst2[k][j][i] = src[k][j][i];
+			}
+
+	aml_copy_layout_generic(&dst_layout, &src_layout);
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+
+	aml_copy_layout_generic(&dst2_layout, &dst_layout);
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(ref_dst2[k][j][i] == dst2[k][j][i]);
+
+}
+
+void test_copy_layout_pad_generic(void)
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t src_pitch[3] = { 10, 6, 4 };
+	size_t src_stride[3] = { 1, 1, 1};
+
+	size_t elem_number2[3] = { 7, 3, 4 };
+
+	double src[4][6][10];
+	double dst[4][6][10];
+	double dst_ref[4][6][10];
+
+
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 3);
+	AML_LAYOUT_PAD_DECL(src_pad, 3, sizeof(double));
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 3, elem_number,
+				src_stride, src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 3, elem_number2,
+				src_stride, src_pitch);
+
+	double neutral = 1337.0;
+	aml_layout_pad_ainit(&src_pad, AML_TYPE_LAYOUT_COLUMN_ORDER,
+			     &src_layout, elem_number2, (void*)&neutral);
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+				dst[k][j][i] = 0.0;
+				dst_ref[k][j][i] = 0.0;
+			}
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 7; i++)
+				dst_ref[k][j][i] = 1337.0;
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				dst_ref[k][j][i] = src[k][j][i];
+	aml_copy_layout_generic(&dst_layout, &src_pad);
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++)
+				assert(dst_ref[k][j][i] == dst[k][j][i]);
+}
+void test_transpose_layout(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t c_src_pitch[4] = { 10, 6, 4, 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t c_dst_pitch[4] = { 3, 2, 4, 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 4);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 4);
+	AML_LAYOUT_NATIVE_DECL(dst2_layout, 4);
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 4, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 4, elem_number2,
+				dst_stride, c_dst_pitch);
+	aml_layout_native_ainit(&dst2_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst2, sizeof(double), 4, elem_number,
+				src_stride, c_src_pitch);
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_layout_transpose_native(&dst_layout, &src_layout);
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_layout_reverse_transpose_native(&dst2_layout, &dst_layout);
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+void test_transpose_layout_generic(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t c_src_pitch[4] = { 10, 6, 4, 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t c_dst_pitch[4] = { 3, 2, 4, 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+	double dst2[8][4][6][10];
+
+	double ref_dst[5][4][2][3];
+	double ref_dst2[8][4][6][10];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 4);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 4);
+	AML_LAYOUT_NATIVE_DECL(dst2_layout, 4);
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 4, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 4, elem_number2,
+				dst_stride, c_dst_pitch);
+	aml_layout_native_ainit(&dst2_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst2, sizeof(double), 4, elem_number,
+				src_stride, c_src_pitch);
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+					ref_dst2[l][k][j][i] = 0.0;
+					dst2[l][k][j][i] = 0.0;
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+					ref_dst2[2 * l][2 * k][2 * j][2 * i] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+
+	aml_copy_layout_transpose_generic(&dst_layout, &src_layout);
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+
+	aml_copy_layout_reverse_transpose_generic(&dst2_layout, &dst_layout);
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++)
+					assert(ref_dst2[l][k][j][i] ==
+					       dst2[l][k][j][i]);
+
+}
+
+int main(int argc, char *argv[])
+{
+	test_copy_2d();
+	test_copy_t2d();
+	test_copy_3d();
+	test_copy_3d_c();
+	test_copy_3dstr();
+	test_copy_3dstr_c();
+	test_copy_t3d();
+	test_copy_rt3d();
+	test_copy_t4d();
+	test_copy_t4d_c();
+	test_copy_t4dstr();
+	test_copy_t4dstr_c();
+	test_copy_sh4d();
+	test_copy_sh4d_c();
+	test_copy_sh4dstr();
+	test_copy_sh4dstr_c();
+	test_copy_layout();
+	test_copy_layout_generic();
+	test_copy_layout_pad_generic();
+	test_transpose_layout();
+	test_transpose_layout_generic();
+	return 0;
+}
diff --git a/tests/dma_layout.c b/tests/dma_layout.c
new file mode 100644
index 00000000..3469b2e3
--- /dev/null
+++ b/tests/dma_layout.c
@@ -0,0 +1,111 @@
+#include <aml.h>
+#include <assert.h>
+
+void test_dma_copy_generic()
+{
+	size_t elem_number[3] = { 5, 3, 2 };
+	size_t c_src_pitch[3] = { 10, 6, 4 };
+	size_t src_stride[3] = { 1, 1, 1};
+	size_t c_dst_pitch[3] = { 5, 3, 2 };
+	size_t dst_stride[3] = { 1, 1, 1};
+
+	double src[4][6][10];
+	double dst[2][3][5];
+
+	double ref_dst[2][3][5];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 3);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 3);
+	AML_DMA_LAYOUT_DECL(dma);
+
+	/* library initialization */
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 3, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 3, elem_number,
+				dst_stride, c_dst_pitch);
+	aml_dma_layout_init(&dma, 1, aml_copy_layout_generic, NULL);
+
+	for (int k = 0; k < 4; k++)
+		for (int j = 0; j < 6; j++)
+			for (int i = 0; i < 10; i++) {
+				src[k][j][i] =
+				    (double)(i + j * 10 + k * 10 * 6);
+			}
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++) {
+				dst[k][j][i] = 0.0;
+				ref_dst[k][j][i] = src[k][j][i];
+			}
+
+	aml_dma_copy(&dma, &dst_layout, &src_layout);
+	for (int k = 0; k < 2; k++)
+		for (int j = 0; j < 3; j++)
+			for (int i = 0; i < 5; i++)
+				assert(ref_dst[k][j][i] == dst[k][j][i]);
+	
+	aml_dma_layout_destroy(&dma);
+}
+
+void test_dma_transpose_generic(void)
+{
+	size_t elem_number[4] = { 5, 3, 2, 4 };
+	size_t elem_number2[4] = { 3, 2, 4, 5 };
+	size_t c_src_pitch[4] = { 10, 6, 4, 8 };
+	size_t src_stride[4] = { 2, 2, 2, 2 };
+	size_t c_dst_pitch[4] = { 3, 2, 4, 5 };
+	size_t dst_stride[4] = { 1, 1, 1, 1 };
+
+	double src[8][4][6][10];
+	double dst[5][4][2][3];
+
+	double ref_dst[5][4][2][3];
+
+	AML_LAYOUT_NATIVE_DECL(src_layout, 4);
+	AML_LAYOUT_NATIVE_DECL(dst_layout, 4);
+	AML_DMA_LAYOUT_DECL(dma);
+
+	aml_layout_native_ainit(&src_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)src, sizeof(double), 4, elem_number,
+				src_stride, c_src_pitch);
+	aml_layout_native_ainit(&dst_layout, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)dst, sizeof(double), 4, elem_number2,
+				dst_stride, c_dst_pitch);
+	aml_dma_layout_init(&dma, 1, aml_copy_layout_transpose_generic, NULL);
+
+	for (int l = 0; l < 8; l++)
+		for (int k = 0; k < 4; k++)
+			for (int j = 0; j < 6; j++)
+				for (int i = 0; i < 10; i++) {
+					src[l][k][j][i] =
+					    (double)(i + j * 10 + k * 10 * 6 +
+						     l * 10 * 6 * 4);
+				}
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++) {
+					dst[i][l][k][j] = 0.0;
+					ref_dst[i][l][k][j] =
+					    src[2 * l][2 * k][2 * j][2 * i];
+				}
+	aml_dma_copy(&dma, &dst_layout, &src_layout);
+	for (int l = 0; l < 4; l++)
+		for (int k = 0; k < 2; k++)
+			for (int j = 0; j < 3; j++)
+				for (int i = 0; i < 5; i++)
+					assert(ref_dst[i][l][k][j] ==
+					       dst[i][l][k][j]);
+	aml_dma_layout_destroy(&dma);
+}
+
+int main(int argc, char *argv[])
+{
+	aml_init(&argc, &argv);
+	test_dma_copy_generic();
+	aml_finalize();
+	return 0;
+}
diff --git a/tests/layout.c b/tests/layout.c
new file mode 100644
index 00000000..83400259
--- /dev/null
+++ b/tests/layout.c
@@ -0,0 +1,456 @@
+#include <aml.h>
+#include <assert.h>
+
+void test_slice_contiguous(void)
+{
+	int memory[6][5][4];
+	size_t dims_col[3] = {4, 5, 6};
+	size_t dims_row[3] = {6, 5, 4};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t offsets_col[3] = {2, 2, 3};
+	size_t offsets_row[3] = {3, 2, 2};
+
+	size_t new_dims_col[3] = {2, 3, 3};
+	size_t new_dims_row[3] = {3, 3, 2};
+
+
+        int l = 0;
+	for(size_t i = 0; i < 6; i++)
+	for(size_t j = 0; j < 5; j++)
+	for(size_t k = 0; k < 4; k++, l++)
+		memory[i][j][k] = l;
+
+	struct aml_layout *a;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	struct aml_layout *b = aml_layout_aslice(a, offsets_col, new_dims_col, stride);
+	assert(AML_TYPE_LAYOUT_COLUMN_ORDER == aml_layout_order(b));
+
+	for(size_t i = 0; i < 3; i++)
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	{
+		assert( memory[i+3][j+2][k+2] == *(int *)aml_layout_deref(b, k, j, i));
+		fprintf(stderr, "%d == %d\n", memory[i+3][j+2][k+2], *(int *)aml_layout_deref(b, k, j, i));
+	}
+	free(a);
+	free(b);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	b = aml_layout_aslice(a, offsets_row, new_dims_row, stride);
+	assert(AML_TYPE_LAYOUT_ROW_ORDER == aml_layout_order(b));
+
+	for(size_t i = 0; i < 3; i++)
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	{
+		assert( memory[i+3][j+2][k+2] == *(int *)aml_layout_deref(b, i, j, k));
+		fprintf(stderr, "%d == %d\n", memory[i+3][j+2][k+2], *(int *)aml_layout_deref(b, i, j, k));
+	}
+	free(a);
+	free(b);
+
+}
+
+void test_slice_strided(void)
+{
+	int memory[12][5][8];
+
+	size_t dims_col[3] = {4, 5, 6};
+	size_t dims_row[3] = {6, 5, 4};
+
+	size_t stride[3] = {2, 1, 2};
+
+	size_t pitch_col[3] = {8, 5, 12};
+	size_t pitch_row[3] = {12, 5, 8};
+
+	size_t offsets_col[3] = {1, 2, 0};
+	size_t offsets_row[3] = {0, 2, 1};
+
+	size_t new_dims_col[3] = {2, 3, 3};
+	size_t new_dims_row[3] = {3, 3, 2};
+
+	size_t new_stride_col[3] = {2, 1, 1};
+	size_t new_stride_row[3] = {1, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 12; i++)
+	for(size_t j = 0; j < 5; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memory[i][j][k] = l;
+
+	struct aml_layout *a;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, pitch_col);
+	struct aml_layout *b = aml_layout_aslice(a, offsets_col, new_dims_col, new_stride_col);
+
+	for(size_t i = 0; i < 3; i++)
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+		assert( memory[stride[2] * (offsets_col[2] + new_stride_col[2] * i)][
+			       stride[1] * (offsets_col[1] + new_stride_col[1] * j)][
+			       stride[0] * (offsets_col[0] + new_stride_col[0] * k)] == *(int *)aml_layout_deref(b, k, j, i));
+
+	free(a);
+	free(b);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, pitch_row);
+	b = aml_layout_aslice(a, offsets_row, new_dims_row, new_stride_row);
+
+	for(size_t i = 0; i < 3; i++)
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+		assert( memory[stride[2] * (offsets_col[2] + new_stride_col[2] * i)][
+			       stride[1] * (offsets_col[1] + new_stride_col[1] * j)][
+			       stride[0] * (offsets_col[0] + new_stride_col[0] * k)] == *(int *)aml_layout_deref(b, i, j, k));
+
+	free(a);
+	free(b);
+
+}
+
+void test_reshape_contiguous(void)
+{
+	int memory[4*5*6];
+
+	size_t dims_col[3] = {4, 5, 6};
+	size_t dims_row[3] = {6, 5, 4};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t new_dims_col[2] = {24, 5};
+	size_t new_dims_row[2] = {5, 24};
+
+	int i;
+        for(i = 0; i < 4*5*6; i++)
+		memory[i] = i;
+
+	struct aml_layout *a;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	struct aml_layout *b = aml_layout_areshape(a, 2, new_dims_col);
+	assert(AML_TYPE_LAYOUT_COLUMN_ORDER == aml_layout_order(b));
+	struct aml_layout *c;
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				   a, 2, new_dims_col);
+	assert(AML_TYPE_LAYOUT_COLUMN_ORDER == aml_layout_order(c));
+
+	i = 0;
+	for(size_t j = 0; j < 5; j++)
+		for(size_t k = 0; k < 24; k++, i++) {
+			assert(i == *(int *)aml_layout_deref(b, k, j));
+			assert(i == *(int *)aml_layout_deref(c, k, j));
+		}
+
+	free(a);
+	free(b);
+	free(c);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	b = aml_layout_areshape(a, 2, new_dims_row);
+	assert(AML_TYPE_LAYOUT_ROW_ORDER == aml_layout_order(b));
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_ROW_ORDER,
+				   a, 2, new_dims_row);
+	assert(AML_TYPE_LAYOUT_ROW_ORDER == aml_layout_order(c));
+
+	i = 0;
+	for(size_t j = 0; j < 5; j++)
+		for(size_t k = 0; k < 24; k++, i++) {
+			assert(i == *(int *)aml_layout_deref(b, j, k));
+			assert(i == *(int *)aml_layout_deref(c, j, k));
+		}
+
+	free(a);
+	free(b);
+	free(c);
+}
+
+void test_reshape_discontiguous(void)
+{
+	int memory[7][6][5];
+
+	size_t dims_col[3] = {4, 5, 6};
+	size_t dims_row[3] = {6, 5, 4};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t pitch_col[3] = {5, 6, 7};
+	size_t pitch_row[3] = {7, 6, 5};
+
+	size_t new_dims_col[5] = {2, 2, 5, 2, 3};
+	size_t new_dims_row[5] = {3, 2, 5, 2, 2};
+
+	int i = 0;
+        for(int j = 0; j < 6; j++)
+		for(int k = 0; k < 5; k++)
+		        for(int l = 0; l < 4; l++, i++)
+				memory[j][k][l] = i;
+
+	struct aml_layout *a;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, pitch_col);
+	struct aml_layout *b = aml_layout_areshape(a, 5, new_dims_col);
+	struct aml_layout *c;
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				   a, 5, new_dims_col);
+
+	i = 0;
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	for(size_t l = 0; l < 5; l++)
+	for(size_t m = 0; m < 2; m++)
+	for(size_t n = 0; n < 2; n++, i++) {
+		assert(i == *(int *)aml_layout_deref(b, n, m, l, k, j));
+		assert(i == *(int *)aml_layout_deref(c, n, m, l, k, j));
+	}
+
+	free(a);
+	free(b);
+	free(c);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, pitch_row);
+	b = aml_layout_areshape(a, 5, new_dims_row);
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_ROW_ORDER,
+				   a, 5, new_dims_row);
+
+
+	i = 0;
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	for(size_t l = 0; l < 5; l++)
+	for(size_t m = 0; m < 2; m++)
+	for(size_t n = 0; n < 2; n++, i++) {
+		assert(i == *(int *)aml_layout_deref(b, j, k, l, m, n));
+		assert(i == *(int *)aml_layout_deref(c, j, k, l, m, n));
+	}
+
+	free(a);
+	free(b);
+	free(c);
+}
+
+void test_reshape_strided(void)
+{
+	int memory[12][5][8];
+
+	size_t dims_col[3] = {4, 5, 6};
+	size_t dims_row[3] = {6, 5, 4};
+
+	size_t stride[3] = {2, 1, 2};
+
+	size_t pitch_col[3] = {8, 5, 12};
+	size_t pitch_row[3] = {12, 5, 8};
+
+	size_t new_dims_col[4] = {2, 10, 2, 3};
+	size_t new_dims_row[4] = {3, 2, 10, 2};
+
+	int i = 0;
+	for(int j = 0; j < 6; j++)
+		for(int k = 0; k < 5; k++)
+			for(int l = 0; l < 4; l++, i++)
+				memory[2*j][1*k][2*l] = i;
+
+	struct aml_layout *a;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, pitch_col);
+	struct aml_layout *b = aml_layout_areshape(a, 4, new_dims_col);
+	struct aml_layout *c;
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				   a, 4, new_dims_col);
+
+	i = 0;
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	for(size_t l = 0; l < 10; l++)
+	for(size_t m = 0; m < 2; m++, i++) {
+		assert(i == *(int *)aml_layout_deref(b, m, l, k, j));
+		assert(i == *(int *)aml_layout_deref(c, m, l, k, j));
+	}
+
+	free(a);
+	free(b);
+	free(c);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, pitch_row);
+	b = aml_layout_areshape(a, 4, new_dims_row);
+	aml_layout_reshape_acreate(&c, AML_TYPE_LAYOUT_ROW_ORDER,
+				   a, 4, new_dims_row);
+
+	i = 0;
+	for(size_t j = 0; j < 3; j++)
+	for(size_t k = 0; k < 2; k++)
+	for(size_t l = 0; l < 10; l++)
+	for(size_t m = 0; m < 2; m++, i++) {
+		assert(i == *(int *)aml_layout_deref(b, j, k, l, m));
+		assert(i == *(int *)aml_layout_deref(c, j, k, l, m));
+	}
+
+	free(a);
+	free(b);
+	free(c);
+}
+
+void test_base(void)
+{
+	struct aml_layout *a;
+	AML_LAYOUT_NATIVE_DECL(b, 5);
+
+	/* padd the dims to the closest multiple of 2 */
+	float memory[16][12][8][8][4];
+	size_t pitch[5] = {4, 8, 8, 12, 16};
+	size_t cpitch[6] = {4, 4*4, 4*4*8, 4*4*8*8, 4*4*8*8*12, 4*4*8*8*12*16};
+	size_t dims[5] = {2, 3, 7, 11, 13};
+	size_t stride[5] = {1, 2, 1, 1, 1};
+
+	size_t dims_col[5] = {2, 3, 7, 11, 13};
+        size_t dims_row[5] = {13, 11, 7, 3, 2};
+
+	size_t pitch_col[5] = {4, 8, 8, 12, 16};
+	size_t pitch_row[5] = {16, 12, 8, 8, 4};
+
+	size_t stride_col[5] = {1, 2, 1, 1, 1};
+	size_t stride_row[5] = {1, 1, 1, 2, 1};
+
+        for(size_t i = 0; i < 4*8*8*12*16; i++)
+		((float*)(&memory[0][0][0][0][0]))[i] = (float)i;
+
+
+	/* initialize column order layouts */
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(float), 5, dims_col,
+				  stride_col, pitch_col);
+	aml_layout_native_ainit(&b, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				(void *)memory, sizeof(float), 5, dims_col,
+				stride_col, pitch_col);
+
+	struct aml_layout_data_native *adataptr;
+	struct aml_layout_data_native *bdataptr;
+
+	adataptr = (struct aml_layout_data_native *)a->data;
+	bdataptr = (struct aml_layout_data_native *)b.data;
+	assert( (intptr_t)(adataptr->stride) - (intptr_t)(adataptr->dims)
+                == 5*sizeof(size_t) );
+	assert( (intptr_t)(adataptr->pitch) - (intptr_t)(adataptr->dims)
+                == 10*sizeof(size_t) );
+	assert( (intptr_t)(adataptr->cpitch) - (intptr_t)(adataptr->dims)
+                == 15*sizeof(size_t) );
+
+	/* some simple checks */
+	assert(!memcmp(adataptr->dims, dims, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->stride, stride, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->pitch, pitch, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->cpitch, cpitch, sizeof(size_t)*6));
+	assert(!memcmp(bdataptr->dims, dims, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->stride, stride, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->pitch, pitch, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->cpitch, cpitch, sizeof(size_t)*6));
+
+	/* test column major subroutines */
+	size_t dims_res[5];
+	size_t coords_test_col[5] = { 1, 2, 3, 4, 5 };
+	void *test_addr;
+	void *res_addr = (void *)&memory[5][4][3][2*2][1];
+
+	aml_layout_adims(a, dims_res);
+	assert(!memcmp(dims_res, dims_col, sizeof(size_t)*5));
+	aml_layout_dims(a, dims_res,
+			   dims_res + 1,
+			   dims_res + 2,
+			   dims_res + 3,
+			   dims_res + 4);
+	assert(!memcmp(dims_res, dims_col, sizeof(size_t)*5));
+	test_addr = aml_layout_aderef(a, coords_test_col);
+	assert(res_addr == test_addr);
+	test_addr = aml_layout_deref(a, coords_test_col[0],
+					coords_test_col[1],
+					coords_test_col[2],
+					coords_test_col[3],
+					coords_test_col[4]);
+	assert(res_addr == test_addr);
+	assert(AML_TYPE_LAYOUT_COLUMN_ORDER == aml_layout_order(a));
+
+	free(a);
+
+	/* initialize row order layouts */
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER, (void *)memory,
+				  sizeof(float), 5, dims_row, stride_row,
+				  pitch_row);
+	aml_layout_native_ainit(&b, AML_TYPE_LAYOUT_ROW_ORDER, (void *)memory,
+				sizeof(float), 5, dims_row, stride_row,
+				pitch_row);
+
+	adataptr = (struct aml_layout_data_native *)a->data;
+	bdataptr = (struct aml_layout_data_native *)b.data;
+	assert( (intptr_t)(adataptr->stride) - (intptr_t)(adataptr->dims)
+                == 5*sizeof(size_t) );
+	assert( (intptr_t)(adataptr->pitch) - (intptr_t)(adataptr->dims)
+                == 10*sizeof(size_t) );
+	assert( (intptr_t)(adataptr->cpitch) - (intptr_t)(adataptr->dims)
+                == 15*sizeof(size_t) );
+
+	/* some simple checks */
+	assert(!memcmp(adataptr->dims, dims, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->stride, stride, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->pitch, pitch, sizeof(size_t)*5));
+	assert(!memcmp(adataptr->cpitch, cpitch, sizeof(size_t)*6));
+	assert(!memcmp(bdataptr->dims, dims, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->stride, stride, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->pitch, pitch, sizeof(size_t)*5));
+	assert(!memcmp(bdataptr->cpitch, cpitch, sizeof(size_t)*6));
+
+	/* test row major subroutines */
+	size_t coords_test_row[5] = { 5, 4, 3, 2, 1 };
+	aml_layout_adims(a, dims_res);
+	assert(!memcmp(dims_res, dims_row, sizeof(size_t)*5));
+	aml_layout_dims(a, dims_res,
+			   dims_res + 1,
+			   dims_res + 2,
+			   dims_res + 3,
+			   dims_res + 4);
+	assert(!memcmp(dims_res, dims_row, sizeof(size_t)*5));
+	test_addr = aml_layout_aderef(a, coords_test_row);
+	assert(res_addr == test_addr);
+	test_addr = aml_layout_deref(a, coords_test_row[0],
+					coords_test_row[1],
+					coords_test_row[2],
+					coords_test_row[3],
+					coords_test_row[4]);
+	assert(res_addr == test_addr);
+	assert(AML_TYPE_LAYOUT_ROW_ORDER == aml_layout_order(a));
+
+	free(a);
+}
+int main(int argc, char *argv[])
+{
+	/* library initialization */
+	aml_init(&argc, &argv);
+
+	test_base();
+	test_reshape_contiguous();
+	test_reshape_discontiguous();
+	test_reshape_strided();
+
+	test_slice_contiguous();
+	test_slice_strided();
+
+	aml_finalize();
+	return 0;
+}
+
diff --git a/tests/tiling_nd.c b/tests/tiling_nd.c
new file mode 100644
index 00000000..6ad02e05
--- /dev/null
+++ b/tests/tiling_nd.c
@@ -0,0 +1,631 @@
+#include <aml.h>
+#include <assert.h>
+
+void test_tiling_collapse(void)
+{
+	int memory[9][8][4][3];
+	int memoryres[9][8][4][3];
+	size_t dims_col[4] = {3, 4, 8, 9};
+	size_t dims_row[4] = {9, 8, 4, 3};
+
+	size_t stride[4] = {1, 1, 1, 1};
+	size_t dims_tile_col[4] = {1, 4, 1, 9};
+	size_t dims_tile_row[4] = {9, 1, 4, 1};
+
+	size_t expected_dims_col[2] = {3, 8};
+	size_t expected_dims_row[2] = {8, 3};
+
+	int n = 0;
+	for(size_t i = 0; i < 9; i++)
+	for(size_t k = 0; k < 8; k++)
+	for(size_t l = 0; l < 4; l++)
+	for(size_t m = 0; m < 3; m++, n++) {
+		memory[i][k][l][m] = n;
+		memoryres[i][k][l][m] = 0;
+	}
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 4, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 4, dims_row,
+				  stride, dims_row);
+
+	struct aml_tiling_nd *t, *tres;
+	aml_tiling_nd_collapse_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				       a, 4, dims_tile_col);
+	aml_tiling_nd_collapse_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				       ares, 4, dims_tile_row);
+
+	assert(aml_tiling_nd_ndims(t) == 2);
+	assert(aml_tiling_nd_ndims(tres) == 2);
+
+	size_t dims[2] = { 0, 0};
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_col, 2 * sizeof(int)) == 0);
+	memset(dims, 0, 2 * sizeof(int));
+	aml_tiling_nd_adims(tres, dims);
+	assert(memcmp(dims, expected_dims_row, 2 * sizeof(int)) == 0);
+
+	for(size_t i = 0; i < expected_dims_col[1]; i++)
+	for(size_t j = 0; j < expected_dims_col[0]; j++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, j, i);
+		bres = aml_tiling_nd_index(tres, i, j);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 9 * 4 * 3 * sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+}
+
+void test_tiling_even_mixed(void)
+{
+	int memory[9][10][8];
+	int memoryres[9][10][8];
+	size_t dims_col[3] = {8, 10, 9};
+	size_t dims_row[3] = {9, 10, 8};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t dims_tile_col[3] = {4, 10, 3};
+	size_t dims_tile_row[3] = {3, 10, 4};
+
+	size_t expected_dims_col[3] = {2, 1, 3};
+	size_t expected_dims_row[3] = {3, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++) {
+		memory[i][j][k] = l;
+		memoryres[i][j][k] = 0;
+	}
+
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+
+
+	struct aml_tiling_nd *t, *tres;
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				     a, 3, dims_tile_col);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				     ares, 3, dims_tile_row);
+
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, k, j, i);
+		bres = aml_tiling_nd_index(tres, i, j, k);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+
+
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_ROW_ORDER,
+				     a, 3, dims_tile_row);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_COLUMN_ORDER,
+				     ares, 3, dims_tile_col);
+
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, i, j, k);
+		bres = aml_tiling_nd_index(tres, k, j, i);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+}
+
+void test_tiling_even(void)
+{
+	int memory[9][10][8];
+	int memoryres[9][10][8];
+	size_t dims_col[3] = {8, 10, 9};
+	size_t dims_row[3] = {9, 10, 8};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t dims_tile_col[3] = {4, 10, 3};
+	size_t dims_tile_row[3] = {3, 10, 4};
+
+	size_t expected_dims_col[3] = {2, 1, 3};
+	size_t expected_dims_row[3] = {3, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++) {
+		memory[i][j][k] = l;
+		memoryres[i][j][k] = 0.0;
+	}
+
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+
+
+	struct aml_tiling_nd *t, *tres;
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				     a, 3, dims_tile_col);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_COLUMN_ORDER,
+				     ares, 3, dims_tile_col);
+
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_COLUMN_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+        size_t dims[3];
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_col, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_col, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, k, j, i);
+		bres = aml_tiling_nd_index(tres, k, j, i);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+
+
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_ROW_ORDER,
+				     a, 3, dims_tile_row);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				     ares, 3, dims_tile_row);
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_ROW_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_row, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_row, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, i, j, k);
+		bres = aml_tiling_nd_index(tres, i, j, k);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+}
+
+void test_tiling_uneven(void)
+{
+
+	int memory[8][10][7];
+	int memoryres[8][10][7];
+	size_t dims_col[3] = {7, 10, 8};
+	size_t dims_row[3] = {8, 10, 7};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t dims_tile_col[3] = {4, 10, 3};
+	size_t dims_tile_row[3] = {3, 10, 4};
+
+	size_t expected_dims_col[3] = {2, 1, 3};
+	size_t expected_dims_row[3] = {3, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 8; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 7; k++, l++) {
+		memory[i][j][k] = l;
+		memoryres[i][j][k] = 0.0;
+	}
+
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+
+
+	struct aml_tiling_nd *t, *tres;
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				     a, 3, dims_tile_col);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_COLUMN_ORDER,
+				     ares, 3, dims_tile_col);
+
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_COLUMN_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+        size_t dims[3];
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_col, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_col, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, k, j, i);
+		bres = aml_tiling_nd_index(tres, k, j, i);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 7 * 10 * 8 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+
+	aml_tiling_nd_resize_acreate(&t, AML_TYPE_TILING_ROW_ORDER,
+				     a, 3, dims_tile_row);
+	aml_tiling_nd_resize_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				     ares, 3, dims_tile_row);
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_ROW_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_row, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_row, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < 8; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 7; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, i, j, k);
+		bres = aml_tiling_nd_index(tres, i, j, k);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 7 * 10 * 8 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+}
+
+void test_tiling_pad_even(void)
+{
+	int memory[9][10][8];
+	int memoryres[9][10][8];
+	size_t dims_col[3] = {8, 10, 9};
+	size_t dims_row[3] = {9, 10, 8};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t dims_tile_col[3] = {4, 10, 3};
+	size_t dims_tile_row[3] = {3, 10, 4};
+
+	size_t expected_dims_col[3] = {2, 1, 3};
+	size_t expected_dims_row[3] = {3, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++) {
+		memory[i][j][k] = l;
+		memoryres[i][j][k] = 0.0;
+	}
+
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+
+
+	struct aml_tiling_nd *t, *tres;
+	int neutral = 0xdeadbeef;
+	aml_tiling_nd_pad_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				  a, 3, dims_tile_col, &neutral);
+	aml_tiling_nd_pad_acreate(&tres, AML_TYPE_TILING_COLUMN_ORDER,
+				  ares, 3, dims_tile_col, &neutral);
+
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_COLUMN_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+        size_t dims[3];
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_col, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_col, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, k, j, i);
+		bres = aml_tiling_nd_index(tres, k, j, i);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+
+	aml_tiling_nd_pad_acreate(&t, AML_TYPE_TILING_ROW_ORDER,
+				  a, 3, dims_tile_row, &neutral);
+	aml_tiling_nd_pad_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				  ares, 3, dims_tile_row, &neutral);
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_ROW_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_row, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_row, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, i, j, k);
+		bres = aml_tiling_nd_index(tres, i, j, k);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	assert(memcmp(memory, memoryres, 8 * 10 * 9 *sizeof(int)) == 0);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+}
+
+void test_tiling_pad_uneven(void)
+{
+
+	int memory[8][10][7];
+	int memoryres[9][10][8];
+	size_t dims_col[3] = {7, 10, 8};
+	size_t dims_row[3] = {8, 10, 7};
+	size_t dims_col_res[3] = {8, 10, 9};
+	size_t dims_row_res[3] = {9, 10, 8};
+
+	size_t stride[3] = {1, 1, 1};
+
+	size_t dims_tile_col[3] = {4, 10, 3};
+	size_t dims_tile_row[3] = {3, 10, 4};
+
+	size_t expected_dims_col[3] = {2, 1, 3};
+	size_t expected_dims_row[3] = {3, 1, 2};
+
+        int l = 0;
+	for(size_t i = 0; i < 8; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 7; k++, l++)
+		memory[i][j][k] = l;
+
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+
+	struct aml_layout *a, *ares;
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_col,
+				  stride, dims_col);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_COLUMN_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_col_res,
+				  stride, dims_col_res);
+
+
+	struct aml_tiling_nd *t, *tres;
+	int neutral = 0xdeadbeef;
+	aml_tiling_nd_pad_acreate(&t, AML_TYPE_TILING_COLUMN_ORDER,
+				  a, 3, dims_tile_col, &neutral);
+	aml_tiling_nd_pad_acreate(&tres, AML_TYPE_TILING_COLUMN_ORDER,
+				  ares, 3, dims_tile_col, &neutral);
+
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_COLUMN_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+        size_t dims[3];
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_col, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_col, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, k, j, i);
+		bres = aml_tiling_nd_index(tres, k, j, i);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		if( k >= 7 || i >= 8)
+			assert(memoryres[i][j][k] == 0xdeadbeef);
+		else
+			assert(memoryres[i][j][k] == memory[i][j][k]);
+
+	free(a);
+	free(t);
+
+	aml_layout_native_acreate(&a, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memory, sizeof(int), 3, dims_row,
+				  stride, dims_row);
+	aml_layout_native_acreate(&ares, AML_TYPE_LAYOUT_ROW_ORDER,
+				  (void *)memoryres, sizeof(int), 3, dims_row_res,
+				  stride, dims_row_res);
+
+	aml_tiling_nd_pad_acreate(&t, AML_TYPE_TILING_ROW_ORDER,
+				  a, 3, dims_tile_row, &neutral);
+	aml_tiling_nd_pad_acreate(&tres, AML_TYPE_TILING_ROW_ORDER,
+				  ares, 3, dims_tile_row, &neutral);
+
+	assert(aml_tiling_nd_order(t) == AML_TYPE_TILING_ROW_ORDER);
+	assert(aml_tiling_nd_ndims(t) == 3);
+
+	aml_tiling_nd_tile_adims(t, dims);
+	assert(memcmp(dims, dims_tile_row, 3*sizeof(size_t)) == 0);
+	aml_tiling_nd_adims(t, dims);
+	assert(memcmp(dims, expected_dims_row, 3*sizeof(size_t)) == 0);
+
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		memoryres[i][j][k] = 0.0;
+
+	for(size_t i = 0; i < expected_dims_col[2]; i++)
+	for(size_t j = 0; j < expected_dims_col[1]; j++)
+	for(size_t k = 0; k < expected_dims_col[0]; k++) {
+		struct aml_layout *b, *bres;
+		b = aml_tiling_nd_index(t, i, j, k);
+		bres = aml_tiling_nd_index(tres, i, j, k);
+		aml_copy_layout_generic(bres, b);
+		free(b);
+		free(bres);
+	}
+	for(size_t i = 0; i < 9; i++)
+	for(size_t j = 0; j < 10; j++)
+	for(size_t k = 0; k < 8; k++, l++)
+		if( k >= 7 || i >= 8)
+			assert(memoryres[i][j][k] == 0xdeadbeef);
+		else
+			assert(memoryres[i][j][k] == memory[i][j][k]);
+
+	free(a);
+	free(ares);
+	free(t);
+	free(tres);
+
+}
+
+int main(int argc, char *argv[])
+{
+	/* library initialization */
+	aml_init(&argc, &argv);
+
+	test_tiling_even();
+	test_tiling_uneven();
+	test_tiling_even_mixed();
+	test_tiling_pad_even();
+	test_tiling_pad_uneven();
+	test_tiling_collapse();
+
+	return 0;
+}
+