Fix bugs in GPUToNVVM lowering
The current lowering from GPU to NVVM does not correctly handle the following cases when lowering the gpu shuffle op:

1. When the active width is set to 32 (all lanes), the current approach computes (1 << 32) - 1, which results in poison values in the LLVM IR. We fix this by defining the active mask as (-1) >> (32 - width).

2. In the case of shuffle up, the computation of the third operand c has to differ from the other three modes due to the op definition in the ISA reference (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html). Specifically, the predicate value is computed as j >= maxLane for up and j <= maxLane for all other modes. We fix this by computing maskAndClamp as 32 - width for this mode.

TEST: We modify the existing test and add more checks for the up mode.

Reviewed By: ThomasRaoux

Differential Revision: https://reviews.llvm.org/D118086
commit e01e4c9115 (parent 810f13f0eb)
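The first fix is easiest to see in plain integer arithmetic. The sketch below is for illustration only and is not part of the patch; the helper names maskOld and maskNew are invented here. It contrasts the two formulas: shifting a 32-bit one left by 32 is undefined behavior in C++ (the counterpart of the poison value in LLVM IR), while the logical right shift of -1 is well defined for every width from 1 to 32 and agrees with the old formula for all widths below 32.

#include <cstdint>
#include <cstdio>

// Old formula: (1 << width) - 1. Undefined behavior when width == 32, because
// the shift amount equals the bit width (poison in LLVM IR terms).
uint32_t maskOld(uint32_t width) {
  return (uint32_t{1} << width) - 1u;
}

// New formula from the patch: (-1) >> (32 - width). For width == 32 the shift
// amount is 0, so the result is simply 0xffffffff (all lanes active).
uint32_t maskNew(uint32_t width) {
  return uint32_t{0xFFFFFFFF} >> (32u - width);
}

int main() {
  // The two formulas agree wherever the old one is defined (width 1..31).
  for (uint32_t w = 1; w <= 31; ++w)
    if (maskOld(w) != maskNew(w))
      std::printf("mismatch at width %u\n", w);
  std::printf("width 32 -> 0x%08x\n", maskNew(32));
  return 0;
}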
@@ -64,8 +64,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   /// the highest lane which participates in the shuffle).
   ///
   ///     %one = llvm.constant(1 : i32) : i32
-  ///     %shl = llvm.shl %one, %width : i32
-  ///     %active_mask = llvm.sub %shl, %one : i32
+  ///     %minus_one = llvm.constant(-1 : i32) : i32
+  ///     %thirty_two = llvm.constant(32 : i32) : i32
+  ///     %num_lanes = llvm.sub %thirty_two, %width : i32
+  ///     %active_mask = llvm.lshr %minus_one, %num_lanes : i32
   ///     %mask_and_clamp = llvm.sub %width, %one : i32
   ///     %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
   ///                                 %mask_and_clamp : !llvm<"{ float, i1 }">
@@ -86,14 +88,24 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
 
     Value one = rewriter.create<LLVM::ConstantOp>(
         loc, int32Type, rewriter.getI32IntegerAttr(1));
-    // Bit mask of active lanes: `(1 << activeWidth) - 1`.
-    Value activeMask = rewriter.create<LLVM::SubOp>(
-        loc, int32Type,
-        rewriter.create<LLVM::ShlOp>(loc, int32Type, one, adaptor.width()),
-        one);
-    // Clamp lane: `activeWidth - 1`
-    Value maskAndClamp =
-        rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);
+    Value minusOne = rewriter.create<LLVM::ConstantOp>(
+        loc, int32Type, rewriter.getI32IntegerAttr(-1));
+    Value thirtyTwo = rewriter.create<LLVM::ConstantOp>(
+        loc, int32Type, rewriter.getI32IntegerAttr(32));
+    Value numLeadInactiveLane = rewriter.create<LLVM::SubOp>(
+        loc, int32Type, thirtyTwo, adaptor.width());
+    // Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
+    Value activeMask = rewriter.create<LLVM::LShrOp>(loc, int32Type, minusOne,
+                                                     numLeadInactiveLane);
+    Value maskAndClamp;
+    if (op.mode() == gpu::ShuffleMode::UP) {
+      // Clamp lane: `32 - activeWidth`
+      maskAndClamp = numLeadInactiveLane;
+    } else {
+      // Clamp lane: `activeWidth - 1`
+      maskAndClamp =
+          rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);
+    }
 
     auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
     Value shfl = rewriter.create<NVVM::ShflOp>(
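To summarize the new lowering in scalar form, the sketch below mirrors the arithmetic the pattern now emits, written as a plain function of the shuffle width and mode. It is an illustration only: the enum, struct, and function names are invented for this sketch (the real pattern uses gpu::ShuffleMode and builds the values as LLVM dialect operations through the rewriter), and the width is assumed to be in the range 1 to 32.

#include <cstdint>

// Sketch-only types; not the MLIR API.
enum class ShuffleMode { Xor, Up, Down, Idx };

struct LoweredOperands {
  uint32_t activeMask;   // membermask operand of the shfl.sync op
  uint32_t maskAndClamp; // third operand (c) of the shfl.sync op
};

// width is assumed to be in [1, 32].
LoweredOperands shuffleOperands(uint32_t width, ShuffleMode mode) {
  uint32_t numLeadInactiveLane = 32u - width;
  // (-1) >> (32 - width): well defined even for width == 32,
  // unlike (1 << 32) - 1.
  uint32_t activeMask = 0xFFFFFFFFu >> numLeadInactiveLane;
  // Up mode clamps with 32 - width; the other modes clamp with width - 1.
  uint32_t maskAndClamp =
      mode == ShuffleMode::Up ? numLeadInactiveLane : width - 1u;
  return {activeMask, maskAndClamp};
}

The mode-dependent clamp follows the predicate rule quoted in the commit message from the PTX ISA reference: the result of shfl.sync is valid when the computed source lane j satisfies j >= maxLane for the up mode and j <= maxLane for the other modes, so up needs a different third operand than the rest.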
@@ -117,14 +117,23 @@ gpu.module @test_module {
     // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32
     %arg2 = arith.constant 23 : i32
     // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
-    // CHECK: %[[#SHL:]] = llvm.shl %[[#ONE]], %[[#WIDTH]] : i32
-    // CHECK: %[[#MASK:]] = llvm.sub %[[#SHL]], %[[#ONE]] : i32
+    // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+    // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
     // CHECK: %[[#CLAMP:]] = llvm.sub %[[#WIDTH]], %[[#ONE]] : i32
     // CHECK: %[[#SHFL:]] = nvvm.shfl.sync bfly %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#CLAMP]] {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
     // CHECK: llvm.extractvalue %[[#SHFL]][0 : index] : !llvm.struct<(f32, i1)>
     // CHECK: llvm.extractvalue %[[#SHFL]][1 : index] : !llvm.struct<(f32, i1)>
     %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32
-    // CHECK: nvvm.shfl.sync up {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
+    // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+    // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
+    // CHECK: %[[#SHFL:]] = nvvm.shfl.sync up %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#NUM_LANES]] {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
+    // CHECK: llvm.extractvalue %[[#SHFL]][0 : index] : !llvm.struct<(f32, i1)>
+    // CHECK: llvm.extractvalue %[[#SHFL]][1 : index] : !llvm.struct<(f32, i1)>
     %shflu, %predu = gpu.shuffle up %arg0, %arg1, %arg2 : f32
     // CHECK: nvvm.shfl.sync down {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
     %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
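For the constant width of 23 used in this test, the quantities named in the new CHECK lines work out to NUM_LANES = 9, MASK = 0x7fffff, and CLAMP = 22, with the up-mode shuffle using NUM_LANES itself as its clamp operand. The pass emits these as llvm arithmetic ops rather than folded constants; the values below are simply what those ops evaluate to. A compile-time check of that arithmetic, for illustration only and not part of the test suite:

#include <cstdint>

constexpr uint32_t kWidth = 23;                                   // %arg2 in the test
constexpr uint32_t kNumLeadInactive = 32 - kWidth;                // NUM_LANES
constexpr uint32_t kActiveMask = 0xFFFFFFFFu >> kNumLeadInactive; // MASK

static_assert(kNumLeadInactive == 9, "clamp used by the up mode");
static_assert(kActiveMask == 0x007FFFFFu, "active-lane mask for width 23");
static_assert(kWidth - 1 == 22, "clamp used by the non-up modes");

int main() { return 0; }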