[mlir][NVGPU] Verifier for nvgpu.ldmatrix

* Adds verifiers for `nvgpu.ldmatrix` op
* Adds tests to `mlir/test/Dialect/NVGPU/invalid.mlir`

Reviewed By: ThomasRaoux

Differential Revision: https://reviews.llvm.org/D129669
commit 713d3de5fb
parent d63ec445ca
@@ -53,8 +53,10 @@ def NVGPU_DeviceAsyncToken : DialectType<
 class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
   Op<NVGPU_Dialect, mnemonic, traits> {}
 
-def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix",
-                                [MemoryEffects<[MemRead]>]> {
+def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
+                                MemoryEffects<[MemRead]>,
+                                PredOpTrait<"srcMemref and res have same element type",
+                                            TCresVTEtIsSameAsOp<0, 0>>]> {
   let description = [{
   The `nvgpu.ldmatrix` op represents loading a matrix fragment from
   memory. The load source and result type must be compatible with lowering
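For context, a well-formed use of the op under the new trait looks like the
following sketch (assembled from the positive shapes implied by the tests added
later in this diff; the function name is illustrative):

    func.func @ldmatrix_valid(%src: memref<128x128xf16, 3>) -> vector<4x2xf16> {
      %c0 = arith.constant 0 : index
      // f16 on both sides satisfies "srcMemref and res have same element type".
      %frag = nvgpu.ldmatrix %src[%c0, %c0] {transpose = false, numTiles = 4 : i32}
          : memref<128x128xf16, 3> -> vector<4x2xf16>
      return %frag : vector<4x2xf16>
    }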
@@ -79,12 +81,14 @@ def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix",
   let assemblyFormat = [{
     $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
   }];
+
+  let hasVerifier = 1;
 }
 
 def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [
-  NoSideEffect,
-  PredOpTrait<"matrixA and matrixB have same element type", TCopVTEtIsSameAs<0, 1>>,
-  ]> {
+                               NoSideEffect,
+                               PredOpTrait<"matrixA and matrixB have same element type",
+                                           TCopVTEtIsSameAs<0, 1>>]> {
   let description = [{
     The `nvgpu.mma.sync` op represents the distributed form of a collective
     matrix-multiply-and-accumulate (mma) operation that is compatible with
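For reference, IR that satisfies the retained element-type trait might look like
this sketch; the fragment shapes follow the m16n8k16 test at the end of this
diff, and %a, %b, %c are assumed f16 fragments already in registers:

    // A and B are both f16, satisfying TCopVTEtIsSameAs<0, 1>.
    // For m16n8k16 across a 32-thread warp: A is 4x2 per thread
    // (256 elements warp-wide), B and C are 2x2 (128 warp-wide).
    %d = nvgpu.mma.sync (%a, %b, %c) {mmaShape = [16, 8, 16]}
        : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>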
@@ -120,8 +124,8 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [
 }
 
 
-def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
-                                       [AttrSizedOperandSegments]> {
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
+                                       AttrSizedOperandSegments]> {
   let summary = "device-side asynchronous copy";
   let description = [{
     The `gpu.device_async_copy` op initiates an asynchronous copy operation of
@@ -88,6 +88,10 @@ LogicalResult DeviceAsyncCopyOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// NVGPU_MmaSyncOp
+//===----------------------------------------------------------------------===//
+
 LogicalResult MmaSyncOp::verify() {
 
   // Fundamental tensor core mma.sync op
@@ -186,5 +190,56 @@ LogicalResult MmaSyncOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// NVGPU_LdMatrixOp
+//===----------------------------------------------------------------------===//
+LogicalResult LdMatrixOp::verify() {
+
+  // ldmatrix reads data from source in shared memory
+  auto srcMemref = getSrcMemref().getType().cast<MemRefType>();
+
+  // ldmatrix writes data to result/destination in vector registers
+  auto resVector = getRes().getType().cast<VectorType>();
+
+  // vector register shape, element type, and bitwidth
+  ArrayRef<int64_t> resShape = resVector.getShape();
+  Type resType = resVector.getElementType();
+  int64_t elementBitWidth = resType.getIntOrFloatBitWidth();
+
+  // ldmatrix loads 32 bits into vector registers per 8-by-8 tile per thread
+  int64_t numElementsPer32b = 32 / elementBitWidth;
+
+  // number of 8-by-8 tiles
+  int64_t numTiles = getNumTiles();
+
+  // transpose elements in vector registers at 16b granularity when true
+  bool isTranspose = getTranspose();
+
+  // address space id for shared memory
+  unsigned smemAddressSpace = gpu::GPUDialect::getWorkgroupAddressSpace();
+
+  //
+  // verification
+  //
+
+  if (!(srcMemref.getMemorySpaceAsInt() == smemAddressSpace))
+    return emitError()
+           << "expected nvgpu.ldmatrix srcMemref must have memory space "
+           << smemAddressSpace;
+  if (elementBitWidth > 32)
+    return emitError() << "nvgpu.ldmatrix works for 32b or lower";
+  if (isTranspose && !(elementBitWidth == 16))
+    return emitError()
+           << "nvgpu.ldmatrix transpose works only at 16b granularity";
+  if (!(resShape[1] == numElementsPer32b))
+    return emitError() << "expected vector register shape[1] = "
+                       << numElementsPer32b;
+  if (!(resShape[0] == numTiles))
+    return emitError()
+           << "expected vector register shape[0] and numTiles to match";
+
+  return success();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
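To make the shape rule concrete: numElementsPer32b = 32 / elementBitWidth, so an
f16 result must have shape[1] == 2 and an f32 result shape[1] == 1, while
shape[0] must always equal numTiles. A sketch of IR that passes every check
above (%src16 and %src32 are hypothetical shared-memory buffers):

    %c0 = arith.constant 0 : index
    // f16: 32/16 = 2 elements per 32b slot; numTiles = 4 => vector<4x2xf16>.
    %f16 = nvgpu.ldmatrix %src16[%c0, %c0] {transpose = false, numTiles = 4 : i32}
        : memref<128x128xf16, 3> -> vector<4x2xf16>
    // f32: 32/32 = 1 element per slot, and transpose must stay false at 32b.
    %f32 = nvgpu.ldmatrix %src32[%c0, %c0] {transpose = false, numTiles = 4 : i32}
        : memref<128x128xf32, 3> -> vector<4x1xf32>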
@@ -1,4 +1,53 @@
 // RUN: mlir-opt -split-input-file -verify-diagnostics %s
+
+func.func @ldmatrix_address_space_f16_x4(%arg0: memref<128x128xf16, 2>) -> vector<4x1xf16> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{expected nvgpu.ldmatrix srcMemref must have memory space 3}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 2> -> vector<4x1xf16>
+  return %a : vector<4x1xf16>
+}
+// -----
+
+func.func @ldmatrix_num_elements_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<4x1xf16> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{expected vector register shape[1] = 2}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x1xf16>
+  return %a : vector<4x1xf16>
+}
+// -----
+
+func.func @ldmatrix_num_tiles_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<2x2xf16> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{expected vector register shape[0] and numTiles to match}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<2x2xf16>
+  return %a : vector<2x2xf16>
+}
+// -----
+
+func.func @ldmatrix_num_tiles_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf32> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{expected vector register shape[1] = 1}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf32>
+  return %a : vector<4x2xf32>
+}
+// -----
+
+func.func @ldmatrix_trans_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x1xf32> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{nvgpu.ldmatrix transpose works only at 16b granularity}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = true, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x1xf32>
+  return %a : vector<4x1xf32>
+}
+// -----
+
+func.func @ldmatrix_type_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf16> {
+  %c0 = arith.constant 0 : index
+  // expected-error @+1 {{'nvgpu.ldmatrix' op failed to verify that srcMemref and res have same element type}}
+  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf16>
+  return %a : vector<4x2xf16>
+}
+// -----
+
 func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
   // expected-error @+1 {{expected 256 warp-wide matrix A elements}}
   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
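These negative cases are driven by the RUN line at the top of the file, e.g.
mlir-opt -split-input-file -verify-diagnostics mlir/test/Dialect/NVGPU/invalid.mlir:
-split-input-file treats each // ----- block as a separate test case, and
-verify-diagnostics matches the emitted errors against the expected-error
annotations.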