[mlir][gpu] Relax restriction on mma load/store op

Those ops can support more complex layout as long as the most inner
dimension is contiguous.

Differential Revision: https://reviews.llvm.org/D122452
This commit is contained in:
Thomas Raoux 2022-03-25 03:31:06 +00:00
parent b62ea9b38b
commit d77f483640
5 changed files with 44 additions and 17 deletions

View File

@ -65,7 +65,8 @@ getMemrefConstantHorizontalStride(ShapedType type) {
return 0;
int64_t offset = 0;
SmallVector<int64_t, 2> strides;
if (failed(getStridesAndOffset(memrefType, strides, offset)))
if (failed(getStridesAndOffset(memrefType, strides, offset)) ||
strides.back() != 1)
return llvm::None;
int64_t stride = strides[strides.size() - 2];
if (stride == ShapedType::kDynamicStrideOrOffset)

View File

@ -1068,6 +1068,17 @@ static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
// GPU_SubgroupMmaLoadMatrixOp
//===----------------------------------------------------------------------===//
/// Return true if the last dimension of the MemRefType has unit stride. Also
/// return true for memrefs with no strides.
static bool isLastMemrefDimUnitStride(MemRefType type) {
int64_t offset;
SmallVector<int64_t> strides;
if (failed(getStridesAndOffset(type, strides, offset))) {
return false;
}
return strides.back() == 1;
}
LogicalResult SubgroupMmaLoadMatrixOp::verify() {
auto srcType = srcMemref().getType();
auto resType = res().getType();
@ -1076,8 +1087,9 @@ LogicalResult SubgroupMmaLoadMatrixOp::verify() {
auto srcMemrefType = srcType.cast<MemRefType>();
auto srcMemSpace = srcMemrefType.getMemorySpaceAsInt();
if (!srcMemrefType.getLayout().isIdentity())
return emitError("expected identity layout map for source memref");
if (!isLastMemrefDimUnitStride(srcMemrefType))
return emitError(
"expected source memref most minor dim must have unit stride");
if (srcMemSpace != kGenericMemorySpace && srcMemSpace != kSharedMemorySpace &&
srcMemSpace != kGlobalMemorySpace)
@ -1102,8 +1114,10 @@ LogicalResult SubgroupMmaStoreMatrixOp::verify() {
auto srcMatrixType = srcType.cast<gpu::MMAMatrixType>();
auto dstMemrefType = dstType.cast<MemRefType>();
auto dstMemSpace = dstMemrefType.getMemorySpaceAsInt();
if (!dstMemrefType.getLayout().isIdentity())
return emitError("expected identity layout map for destination memref");
if (!isLastMemrefDimUnitStride(dstMemrefType))
return emitError(
"expected destination memref most minor dim must have unit stride");
if (dstMemSpace != kGenericMemorySpace && dstMemSpace != kSharedMemorySpace &&
dstMemSpace != kGlobalMemorySpace)
@ -1232,15 +1246,6 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
// GPU_DeviceAsyncCopyOp
//===----------------------------------------------------------------------===//
/// Return true if the last dimension of the MemRefType has unit stride. Also
/// return true for memrefs with no strides.
static bool isLastMemrefDimUnitStride(MemRefType type) {
int64_t offset;
SmallVector<int64_t> strides;
auto successStrides = getStridesAndOffset(type, strides, offset);
return succeeded(successStrides) && (strides.empty() || strides.back() == 1);
}
LogicalResult DeviceAsyncCopyOp::verify() {
auto srcMemref = src().getType().cast<MemRefType>();
auto dstMemref = dst().getType().cast<MemRefType>();

View File

@ -151,3 +151,22 @@ func @matmul_3Dmemref(%arg0: memref<2x16x16xf16>, %arg1: memref<16xf16>, %arg2:
vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16>
return
}
// CHECK-LABEL: func @matmul_memref_strided
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {leadDimension = 32 : index} : memref<2x16x16xf16, #{{.*}}> -> !gpu.mma_matrix<16x16xf16, "AOp">
// CHECK-DAG: %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%[[C0]]] {leadDimension = 0 : index} : memref<16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
// CHECK-DAG: %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {leadDimension = 16 : index} : memref<2x16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
// CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">
// CHECK: gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<2x16x16xf16>
func @matmul_memref_strided(%arg0: memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, %arg1: memref<16xf16>, %arg2: memref<2x16x16xf16>) {
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16>
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16, affine_map<(d0, d1, d2) -> (d0 * 512 + d1 * 32 + d2)>>, vector<16x16xf16>
%B = vector.transfer_read %arg1[%c0], %cst {permutation_map = #map4, in_bounds = [true, true]} : memref<16xf16>, vector<16x16xf16>
%C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x16x16xf16>, vector<16x16xf16>
%D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16>
vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<2x16x16xf16>
return
}

View File

@ -491,7 +491,7 @@ func @mmamatrix_invalid_element_type(){
func @mmaLoadOp_identity_layout(){
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>
%i = arith.constant 16 : index
// expected-error @+1 {{expected identity layout map for source memref}}
// expected-error @+1 {{expected source memref most minor dim must have unit stride}}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, #layout_map_col_major, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
return
}
@ -514,7 +514,7 @@ func @wmmaStoreOp_invalid_map(%arg0 : !gpu.mma_matrix<16x16xf16, "COp">) -> () {
%sg = memref.alloca(){alignment = 32} : memref<32x32xf16, #layout_map_col_major, 3>
%i = arith.constant 16 : index
%j = arith.constant 16 : index
// expected-error @+1 {{expected identity layout map for destination memref}}
// expected-error @+1 {{expected destination memref most minor dim must have unit stride}}
gpu.subgroup_mma_store_matrix %arg0, %sg[%i,%j] {leadDimension= 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16,#layout_map_col_major, 3>
return
}

View File

@ -227,7 +227,7 @@ module attributes {gpu.container_module} {
return
}
func @mmamatrix_valid_element_type(){
func @mmamatrix_valid_element_type(%src : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>>){
// CHECK-LABEL: func @mmamatrix_valid_element_type
%wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
// CHECK: %[[wg:.*]] = memref.alloca()
@ -237,6 +237,8 @@ module attributes {gpu.container_module} {
// CHECK: %[[cst:.*]] = arith.constant 1.000000e+00 : f32
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
// CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
%s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 64 : index} : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>> -> !gpu.mma_matrix<16x16xf16, "AOp">
// CHECK: gpu.subgroup_mma_load_matrix %{{.*}}[%[[i]], %[[i]]] {leadDimension = 64 : index} : memref<32x32xf16, #{{.*}}> -> !gpu.mma_matrix<16x16xf16, "AOp">
%1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
// CHECK: gpu.subgroup_mma_elementwise addf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
%2 = gpu.subgroup_mma_elementwise addf %1, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">