forked from OSchip/llvm-project
AMDGPU: Move v_readlane lane select from VGPR to SGPR
Summary: Fix a compiler bug when the lane select happens to end up in a VGPR. Clarify the semantics of the corresponding intrinsic to be those of the corresponding GLSL builtin: the lane select must be uniform across a wave front, otherwise results are undefined. Reviewers: arsenm Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D32343 llvm-svn: 301197
This commit is contained in:
parent
a266923d57
commit
5dea645138
|
@ -629,6 +629,8 @@ def int_amdgcn_readfirstlane :
|
|||
GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
|
||||
|
||||
// llvm.amdgcn.readlane(value, lane) reads `value` from the VGPR lane selected
// by `lane` and returns it as a uniform (SGPR) i32.
//
// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  GCCBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent]>;
|
||||
|
|
|
@ -2640,6 +2640,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
|
|||
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
|
||||
return;
|
||||
|
||||
// Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
|
||||
// lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
|
||||
// select is uniform.
|
||||
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
|
||||
RI.isVGPR(MRI, Src1.getReg())) {
|
||||
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
|
||||
.add(Src1);
|
||||
Src1.ChangeToRegister(Reg, false);
|
||||
return;
|
||||
}
|
||||
|
||||
// We do not use commuteInstruction here because it is too aggressive and will
|
||||
// commute if it is possible. We only want to commute here if it improves
|
||||
// legality. This can be called a fairly large number of times so don't waste
|
||||
|
|
|
@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s
|
|||
ret void
|
||||
}
|
||||
|
||||
; The lane select starts life in a VGPR (it is loaded per-thread); codegen must
; move it into an SGPR via v_readfirstlane before v_readlane can use it.
; CHECK-LABEL: {{^}}test_readlane_vregs:
; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
  %value = extractelement <2 x i32> %args, i32 0
  %lane = extractelement <2 x i32> %args, i32 1
  %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
  store i32 %readlane, i32 addrspace(1)* %out, align 4
  ret void
}
|
||||
|
||||
; TODO: m0 should be folded.
|
||||
; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
|
||||
; CHECK: s_mov_b32 m0, -1
|
||||
|
@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0)
|
|||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone }
|
||||
|
|
Loading…
Reference in New Issue