forked from OSchip/llvm-project
[AMDGPU] Add llvm.amdgcn.softwqm intrinsic
Add llvm.amdgcn.softwqm intrinsic which behaves like llvm.amdgcn.wqm only if there is other WQM computation in the shader. Reviewers: nhaehnle, tpr Reviewed By: nhaehnle Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64935 llvm-svn: 367097
This commit is contained in:
parent
9758407bf1
commit
00e89b428b
|
@ -1431,6 +1431,13 @@ def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
|
|||
[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
// Copies the source value to the destination value, such that the
// source is computed as if the entire program were executed in WQM if
// any other program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<
  [llvm_any_ty],                        // overloaded result type
  [LLVMMatchType<0>],                   // single operand, same type as result
  [IntrNoMem, IntrSpeculatable]         // pure value copy, safe to speculate
>;
|
||||
|
||||
// Return true if at least one thread within the pixel quad passes true into
|
||||
// the function.
|
||||
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
|
||||
|
|
|
@ -282,6 +282,7 @@ private:
|
|||
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
|
||||
void SelectDS_GWS(SDNode *N, unsigned IntrID);
|
||||
void SelectINTRINSIC_W_CHAIN(SDNode *N);
|
||||
void SelectINTRINSIC_WO_CHAIN(SDNode *N);
|
||||
void SelectINTRINSIC_VOID(SDNode *N);
|
||||
|
||||
protected:
|
||||
|
@ -908,6 +909,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|||
SelectINTRINSIC_W_CHAIN(N);
|
||||
return;
|
||||
}
|
||||
case ISD::INTRINSIC_WO_CHAIN: {
|
||||
SelectINTRINSIC_WO_CHAIN(N);
|
||||
return;
|
||||
}
|
||||
case ISD::INTRINSIC_VOID: {
|
||||
SelectINTRINSIC_VOID(N);
|
||||
return;
|
||||
|
@ -2235,6 +2240,22 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
|
|||
SelectCode(N);
|
||||
}
|
||||
|
||||
// Custom selection for ISD::INTRINSIC_WO_CHAIN nodes. Only the softwqm
// intrinsic is handled here; every other intrinsic falls through to the
// auto-generated matcher tables via SelectCode().
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  // Operand 0 of a chain-less intrinsic node is the intrinsic ID.
  const unsigned IntrID =
      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();

  if (IntrID != Intrinsic::amdgcn_softwqm) {
    // Not ours — defer to the generated ISel tables.
    SelectCode(N);
    return;
  }

  // Replace the intrinsic node with the SOFT_WQM pseudo, forwarding the
  // single value operand. SOFT_WQM is later lowered to a copy by the
  // SIWholeQuadMode pass.
  SDValue Value = N->getOperand(1);
  CurDAG->SelectNodeTo(N, AMDGPU::SOFT_WQM, N->getVTList(), {Value});
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
|
||||
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
|
||||
switch (IntrID) {
|
||||
|
|
|
@ -617,6 +617,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
|||
continue;
|
||||
case AMDGPU::COPY:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM: {
|
||||
// If the destination register is a physical register there isn't really
|
||||
// much we can do to fix this.
|
||||
|
|
|
@ -3631,6 +3631,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
|
|||
case AMDGPU::PHI: return AMDGPU::PHI;
|
||||
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
|
||||
case AMDGPU::WQM: return AMDGPU::WQM;
|
||||
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
|
||||
case AMDGPU::WWM: return AMDGPU::WWM;
|
||||
case AMDGPU::S_MOV_B32: {
|
||||
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
|
||||
|
@ -5506,6 +5507,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
|
|||
switch (UseMI.getOpcode()) {
|
||||
case AMDGPU::COPY:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM:
|
||||
case AMDGPU::REG_SEQUENCE:
|
||||
case AMDGPU::PHI:
|
||||
|
@ -5623,6 +5625,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
|
|||
case AMDGPU::REG_SEQUENCE:
|
||||
case AMDGPU::INSERT_SUBREG:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM: {
|
||||
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
|
||||
if (RI.hasAGPRs(SrcRC)) {
|
||||
|
|
|
@ -111,6 +111,10 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
|
|||
// WQM pass processes it.
|
||||
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
|
||||
// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
|
||||
// turned into a copy by WQM pass, but does not seed WQM requirements.
|
||||
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
|
||||
// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
|
||||
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
|
||||
// the instruction that defines $src0 (which is run in WWM) doesn't
|
||||
|
|
|
@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
|||
char GlobalFlags = 0;
|
||||
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
|
||||
SmallVector<MachineInstr *, 4> SetInactiveInstrs;
|
||||
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
|
||||
|
||||
// We need to visit the basic blocks in reverse post-order so that we visit
|
||||
// defs before uses, in particular so that we don't accidentally mark an
|
||||
|
@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
|||
// correct, so we need it to be in WQM.
|
||||
Flags = StateWQM;
|
||||
LowerToCopyInstrs.push_back(&MI);
|
||||
} else if (Opcode == AMDGPU::SOFT_WQM) {
|
||||
LowerToCopyInstrs.push_back(&MI);
|
||||
SoftWQMInstrs.push_back(&MI);
|
||||
continue;
|
||||
} else if (Opcode == AMDGPU::WWM) {
|
||||
// The WWM intrinsic doesn't make the same guarantee, and plus it needs
|
||||
// to be executed in WQM or Exact so that its copy doesn't clobber
|
||||
|
@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
|||
// Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
|
||||
// ever used anywhere in the function. This implements the corresponding
|
||||
// semantics of @llvm.amdgcn.set.inactive.
|
||||
// Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
|
||||
if (GlobalFlags & StateWQM) {
|
||||
for (MachineInstr *MI : SetInactiveInstrs)
|
||||
markInstruction(*MI, StateWQM, Worklist);
|
||||
for (MachineInstr *MI : SoftWQMInstrs)
|
||||
markInstruction(*MI, StateWQM, Worklist);
|
||||
}
|
||||
|
||||
return GlobalFlags;
|
||||
|
@ -885,7 +893,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
|||
unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
if (!(GlobalFlags & StateWQM)) {
|
||||
lowerLiveMaskQueries(Exec);
|
||||
if (!(GlobalFlags & StateWWM))
|
||||
if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
|
||||
return !LiveMaskQueries.empty();
|
||||
} else {
|
||||
// Store a copy of the original live mask when required
|
||||
|
|
|
@ -0,0 +1,188 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
|
||||
|
||||
; Check that WQM is not triggered by the softwqm intrinsic alone.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test1:
|
||||
;CHECK-NOT: s_wqm_b64 exec, exec
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Check that the softwqm intrinsic works correctly for integers.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test2:
|
||||
;CHECK-NOT: s_wqm_b64 exec, exec
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = bitcast float %out to i32
|
||||
%out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
|
||||
%out.2 = bitcast i32 %out.1 to float
|
||||
ret float %out.2
|
||||
}
|
||||
|
||||
; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_softwqm1:
|
||||
;CHECK-NOT: s_wqm_b64 exec, exec
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_store_dword
|
||||
;CHECK-NOT: s_wqm_b64 exec, exec
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%temp = fadd float %src0, %src1
|
||||
call void @llvm.amdgcn.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %temp, %temp
|
||||
%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_softwqm2:
|
||||
;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
|
||||
;CHECK: s_wqm_b64 exec, exec
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: s_and_b64 exec, exec, [[ORIG]]
|
||||
;CHECK: buffer_store_dword
|
||||
;CHECK: s_wqm_b64 exec, exec
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%temp = fadd float %src0, %src1
|
||||
%temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
|
||||
call void @llvm.amdgcn.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %temp, %temp
|
||||
%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_wwm1:
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_store_dword
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK-NOT: s_wqm_b64
|
||||
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%temp = fadd float %src0, %src1
|
||||
%temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
|
||||
%out = fadd float %temp.0, %temp.0
|
||||
%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Check that softwqm on one case of branch does not trigger WQM for shader.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_control_flow_0:
|
||||
;CHECK-NEXT: ; %main_body
|
||||
;CHECK-NOT: s_wqm_b64 exec, exec
|
||||
;CHECK: %ELSE
|
||||
;CHECK: store
|
||||
;CHECK: %IF
|
||||
;CHECK: buffer_load
|
||||
;CHECK: buffer_load
|
||||
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
|
||||
main_body:
|
||||
%cmp = icmp eq i32 %z, 0
|
||||
br i1 %cmp, label %IF, label %ELSE
|
||||
|
||||
IF:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %src0, %src1
|
||||
%data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
br label %END
|
||||
|
||||
ELSE:
|
||||
call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
|
||||
br label %END
|
||||
|
||||
END:
|
||||
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
|
||||
ret float %r
|
||||
}
|
||||
|
||||
; Check that softwqm on one case of branch is treated as WQM in WQM shader.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_control_flow_1:
|
||||
;CHECK-NEXT: ; %main_body
|
||||
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
|
||||
;CHECK-NEXT: s_wqm_b64 exec, exec
|
||||
;CHECK: %ELSE
|
||||
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
|
||||
;CHECK: store
|
||||
;CHECK: s_mov_b64 exec, [[SAVED]]
|
||||
;CHECK: %IF
|
||||
;CHECK-NOT: s_and_saveexec_b64
|
||||
;CHECK-NOT: s_and_b64 exec
|
||||
;CHECK: buffer_load
|
||||
;CHECK: buffer_load
|
||||
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
|
||||
main_body:
|
||||
%c.bc = bitcast i32 %c to float
|
||||
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
|
||||
%tex0 = extractelement <4 x float> %tex, i32 0
|
||||
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
|
||||
%data.sample = extractelement <4 x float> %dtex, i32 0
|
||||
|
||||
%cmp = icmp eq i32 %z, 0
|
||||
br i1 %cmp, label %IF, label %ELSE
|
||||
|
||||
IF:
|
||||
%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
|
||||
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
|
||||
%out = fadd float %src0, %src1
|
||||
%data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
br label %END
|
||||
|
||||
ELSE:
|
||||
call void @llvm.amdgcn.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
|
||||
br label %END
|
||||
|
||||
END:
|
||||
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
|
||||
ret float %r
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
|
||||
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
|
||||
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
|
||||
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
|
||||
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
|
||||
declare void @llvm.amdgcn.kill(i1) #1
|
||||
declare float @llvm.amdgcn.wqm.f32(float) #3
|
||||
declare float @llvm.amdgcn.softwqm.f32(float) #3
|
||||
declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
|
||||
declare float @llvm.amdgcn.wwm.f32(float) #3
|
||||
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind readonly }
|
||||
attributes #3 = { nounwind readnone }
|
Loading…
Reference in New Issue