ROCm · coderfeli · Mar 25, 2026 · Mar 20, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/include/flydsl/Dialect/Fly/IR/FlyOps.td b/include/flydsl/Dialect/Fly/IR/FlyOps.td
@@ -251,7 +251,7 @@ def Fly_CosizeOp : Fly_Op<"cosize", [Pure, DeclareOpInterfaceMethods<InferTypeOp
 }
 
 def Fly_Crd2IdxOp : Fly_Op<"crd2idx", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
-  let arguments = (ins Fly_IntTuple:$coord, Fly_NarrowLayoutType:$layout);
+  let arguments = (ins Fly_IntTuple:$coord, AnyTypeOf<[Fly_NarrowLayoutType, Fly_Swizzle]>:$layout); // $layout may now also be a swizzle; presumably needed for crd2idx on a composed layout's inner part — TODO confirm
   let results = (outs Fly_IntTuple:$index);
 }
 def Fly_Idx2CrdOp : Fly_Op<"idx2crd", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
@@ -337,7 +337,7 @@ def Fly_MakeMmaAtomOp : Fly_Op<"make_mma_atom", [Pure]> {
 def Fly_MakeCopyAtomOp : Fly_Op<"make_copy_atom", [Pure]> {
   let arguments = (ins I32Attr:$valBits);
   let results = (outs Fly_CopyAtom:$result);
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "attr-dict `:` qualified(type($result))"; // qualified(): always print the result type in fully-qualified form
 }
 
 def Fly_CopyAtomCall : Fly_Op<"copy_atom_call"> {
@@ -374,6 +374,7 @@ def Fly_TiledMmaPartitionOp : Fly_Op<"tiled_mma.partition", [Pure, DeclareOpInte
   let arguments = (ins Fly_MmaOperandAttr:$operand_id, Fly_TiledMma:$tiled_mma,
                        Fly_TensorLikeType:$input, Fly_IntTuple:$coord);
   let results = (outs Fly_TensorLikeType:$result);
+  let assemblyFormat = "`(` $operand_id `,` $tiled_mma `,` $input `,` $coord `)` attr-dict `:` functional-type(operands, results)";
 }
 def Fly_TiledMmaPartitionShapeOp : Fly_Op<"tiled_mma.partition_shape", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
   let arguments = (ins Fly_MmaOperandAttr:$operand_id, Fly_TiledMma:$tiled_mma, Fly_IntTuple:$shape);
@@ -421,6 +422,10 @@ def Fly_ApplySwizzleOp : Fly_Op<"apply_swizzle", [Pure, DeclareOpInterfaceMethod
   let arguments = (ins Fly_Pointer:$ptr, Fly_Swizzle:$swizzle);
   let results = (outs Fly_Pointer:$result);
 }
+def Fly_DecompositionOp : Fly_Op<"decomposition", [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let arguments = (ins Fly_TensorLikeType:$tensor); // one tensor-like operand
+  let results = (outs Fly_TensorLikeType:$result);  // result type is provided by the op's InferTypeOpInterface implementation
+}
 
 def Fly_PtrLoadOp : Fly_Op<"ptr.load", [DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
   let arguments = (ins Fly_IteratorLikeType:$ptr);

diff --git a/include/flydsl/Dialect/Fly/Transforms/MemrefLowering.td b/include/flydsl/Dialect/Fly/Transforms/MemrefLowering.td
@@ -15,6 +15,27 @@ def : Pat<(Fly_AddOffsetOp Fly_IntTuple:$int_tuple, Fly_IntTuple:$offset),
 def : Pat<(Fly_AddOffsetOp (Fly_AddOffsetOp Fly_Pointer:$ptr, Fly_IntTuple:$offset1), Fly_IntTuple:$offset2),
           (Fly_AddOffsetOp $ptr, (Fly_IntTupleAddOp $offset1, $offset2))>;
 
+def : Pat<(Fly_DecompositionOp Fly_MemRef:$memref), // memref case, guarded by Fly_SimpleLayoutMemRef:
+          (replaceWithValue $memref),               // the op is replaced by its own operand
+          [(Fly_SimpleLayoutMemRef $memref)]>;
+def : Pat<(Fly_DecompositionOp Fly_CoordTensor:$tensor), // coord-tensor case, guarded by Fly_SimpleLayoutCoordTensor:
+          (replaceWithValue $tensor),                    // the op is replaced by its own operand
+          [(Fly_SimpleLayoutCoordTensor $tensor)]>;
+def : Pat<(Fly_DecompositionOp Fly_MemRef:$memref), // memref case, guarded by Fly_ComposedLayoutMemRef:
+          (Fly_MakeViewOp                            // rebuild the view from two pieces
+            (Fly_AddOffsetOp (Fly_GetIterOp $memref), // 1) the iterator advanced by crd2idx(offset, inner)
+                             (Fly_Crd2IdxOp (Fly_ComposedGetOffsetOp (Fly_GetLayoutOp $memref)),
+                                            (Fly_ComposedGetInnerOp (Fly_GetLayoutOp $memref)))),
+            (Fly_ComposedGetOuterOp (Fly_GetLayoutOp $memref))), // 2) the outer part of the composed layout
+          [(Fly_ComposedLayoutMemRef $memref)]>;
+def : Pat<(Fly_DecompositionOp Fly_CoordTensor:$tensor), // coord-tensor case, guarded by Fly_ComposedLayoutCoordTensor:
+          (Fly_MakeViewOp                                 // rebuild the view from two pieces
+            (Fly_AddOffsetOp (Fly_GetIterOp $tensor),     // 1) the iterator advanced by crd2idx(offset, inner)
+                             (Fly_Crd2IdxOp (Fly_ComposedGetOffsetOp (Fly_GetLayoutOp $tensor)),
+                                            (Fly_ComposedGetInnerOp (Fly_GetLayoutOp $tensor)))),
+            (Fly_ComposedGetOuterOp (Fly_GetLayoutOp $tensor))), // 2) the outer part of the composed layout
+          [(Fly_ComposedLayoutCoordTensor $tensor)]>;
+
 def : Pat<(Fly_MemRefLoadOp Fly_MemRef:$memref, $indices),
           (Fly_PtrLoadOp (Fly_AddOffsetOp (Fly_GetIterOp $memref),
                                           (Fly_Crd2IdxOp $indices, (Fly_GetLayoutOp $memref))))>;

diff --git a/lib/Conversion/FlyToROCDL/BufferFatPtr.h b/lib/Conversion/FlyToROCDL/BufferFatPtr.h
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (c) 2025 FlyDSL Project Contributors
+
+#ifndef FLYDSL_LIB_CONVERSION_FLYTOROCDL_BUFFERFATPTR_H
+#define FLYDSL_LIB_CONVERSION_FLYTOROCDL_BUFFERFATPTR_H
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+
+#include "flydsl/Dialect/Fly/IR/FlyDialect.h"
+
+namespace mlir::fly {
+
+class BufferFatPtr { // builder-side helper around an LLVM {resource ptr, offset} struct value
+  static constexpr unsigned kRsrcAddrSpace = 8;   // LLVM address space of the resource pointer (BufferDesc)
+  static constexpr unsigned kOffsetBitWidth = 32; // width of the offset field; constrained by BufferCopy instruction
+  // ptrTy supplies the element type and swizzle; fatPtr is the value that fields 0 and 1 are read from.
+  fly::PointerType ptrTy;
+  Value fatPtr;
+  // Wraps an existing value v; the only check is the assert that ptrTy is a BufferDesc pointer.
+public:
+  BufferFatPtr(fly::PointerType ptrTy, Value v) : ptrTy(ptrTy), fatPtr(v) {
+    assert(ptrTy.getAddressSpace().getValue() == AddressSpace::BufferDesc);
+  }
+  // Literal LLVM struct type { ptr addrspace(kRsrcAddrSpace), i<kOffsetBitWidth> } of a fat pointer.
+  static LLVM::LLVMStructType getType(MLIRContext *ctx) {
+    return LLVM::LLVMStructType::getLiteral(ctx, {LLVM::LLVMPointerType::get(ctx, kRsrcAddrSpace),
+                                                  IntegerType::get(ctx, kOffsetBitWidth)});
+  }
+  static Value pack(OpBuilder &b, Location loc, Value bufferRsrc, Value valOffset = nullptr) {
+    auto structTy = getType(b.getContext()); // pack: build {bufferRsrc, valOffset} starting from undef
+    Value undef = LLVM::UndefOp::create(b, loc, structTy);
+    if (!valOffset) { // no offset supplied: use the constant 0
+      valOffset = arith::ConstantIntOp::create(b, loc, 0, kOffsetBitWidth);
+    }
+    Value withRsrc = LLVM::InsertValueOp::create(b, loc, undef, bufferRsrc, ArrayRef<int64_t>{0});
+    return LLVM::InsertValueOp::create(b, loc, withRsrc, valOffset, ArrayRef<int64_t>{1});
+  }
+  // Emits an extractvalue of field 0 (the buffer resource) from fatPtr.
+  Value bufferRsrc(OpBuilder &b, Location loc) const {
+    return LLVM::ExtractValueOp::create(b, loc, fatPtr, ArrayRef<int64_t>{0});
+  }
+  // Emits an extractvalue of field 1 (the offset; byteOffset() treats it as an element count).
+  Value valOffset(OpBuilder &b, Location loc) const {
+    return LLVM::ExtractValueOp::create(b, loc, fatPtr, ArrayRef<int64_t>{1});
+  }
+  // Converts the stored element offset to bytes using the bit width of ptrTy's element type.
+  Value byteOffset(OpBuilder &b, Location loc) const {
+    int64_t bits = ptrTy.getElemTy().getIntOrFloatBitWidth(); // element type must be an int or float type
+    Value off = valOffset(b, loc);
+    if (bits == 8) // 1-byte elements: the element offset already is the byte offset
+      return off;
+    if (bits > 8 && bits % 8 == 0) { // whole-byte elements: off * (bits / 8)
+      int64_t elemBytes = bits / 8;
+      Value scale = arith::ConstantIntOp::create(b, loc, elemBytes, kOffsetBitWidth);
+      return arith::MulIOp::create(b, loc, off, scale);
+    } // remaining widths: (off * bits) / 8 with an unsigned division
+    Value scale = arith::ConstantIntOp::create(b, loc, bits, kOffsetBitWidth);
+    off = arith::MulIOp::create(b, loc, off, scale); // NOTE(review): i32 multiply, can wrap for large offsets
+    Value const8 = arith::ConstantIntOp::create(b, loc, 8, kOffsetBitWidth);
+    return arith::DivUIOp::create(b, loc, off, const8);
+  }
+  // Byte offset with ptrTy's swizzle applied: off ^ ((off & bitMask) >> shift); unchanged if trivial.
+  Value swizzleByteOffset(OpBuilder &b, Location loc) const {
+    Value off = byteOffset(b, loc);
+    SwizzleAttr swizzle = ptrTy.getSwizzle();
+    if (swizzle.isTrivialSwizzle())
+      return off;
+    auto offsetTy = IntegerType::get(b.getContext(), kOffsetBitWidth);
+    int64_t bitMaskValue = ((int64_t{1} << swizzle.getMask()) - 1)
+                           << (swizzle.getBase() + swizzle.getShift()); // getMask() one-bits starting at bit base + shift
+    Value bitMask = arith::ConstantIntOp::create(b, loc, offsetTy, bitMaskValue);
+    Value shiftAmt = arith::ConstantIntOp::create(b, loc, offsetTy, swizzle.getShift());
+    Value masked = arith::AndIOp::create(b, loc, off, bitMask);
+    Value shifted = arith::ShRUIOp::create(b, loc, masked, shiftAmt);
+    return arith::XOrIOp::create(b, loc, off, shifted);
+  }
+  // Returns a newly packed struct value with the same resource and offset = valOffset + delta.
+  Value addOffset(OpBuilder &b, Location loc, Value delta) const {
+    Type offTy = IntegerType::get(b.getContext(), kOffsetBitWidth);
+    if (delta.getType() != offTy) { // convert delta to the offset type before adding
+      if (delta.getType().isIndex())
+        delta = arith::IndexCastOp::create(b, loc, offTy, delta);
+      else if (delta.getType().getIntOrFloatBitWidth() < kOffsetBitWidth)
+        delta = arith::ExtSIOp::create(b, loc, offTy, delta); // narrower: sign-extend
+      else
+        delta = arith::TruncIOp::create(b, loc, offTy, delta); // otherwise: truncate
+    }
+    Value newOff = arith::AddIOp::create(b, loc, valOffset(b, loc), delta);
+    return pack(b, loc, bufferRsrc(b, loc), newOff);
+  }
+};
+
+} // namespace mlir::fly
+
+#endif // FLYDSL_LIB_CONVERSION_FLYTOROCDL_BUFFERFATPTR_H