AMDGPU: Add new amdgcn.init.exec intrinsics
v2: More tests, bug fixes, cosmetic changes.

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr,
llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D31762

llvm-svn: 301677

@@ -108,6 +108,21 @@ def int_amdgcn_implicit_buffer_ptr :
  GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//
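
For illustration only, and not part of this commit: a minimal sketch of how a
frontend might emit the two new intrinsics through IRBuilder. The names
emitInitExec, M, Builder, and CountSGPR are assumptions made for this sketch,
not code from the change.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    void emitInitExec(llvm::Module *M, llvm::IRBuilder<> &Builder,
                      llvm::Value *CountSGPR) {
      // Variant 1: enable all 64 lanes with a literal mask.
      llvm::Function *InitExec =
          llvm::Intrinsic::getDeclaration(M, llvm::Intrinsic::amdgcn_init_exec);
      Builder.CreateCall(InitExec, {Builder.getInt64(~0ULL)});

      // Variant 2 (an alternative, not used together with variant 1): take a
      // 7-bit thread count from bits [25:19] of an SGPR input; EXEC becomes
      // (1 << count) - 1, or all ones when the count is 64.
      llvm::Function *InitExecFromInput = llvm::Intrinsic::getDeclaration(
          M, llvm::Intrinsic::amdgcn_init_exec_from_input);
      Builder.CreateCall(InitExecFromInput, {CountSGPR, Builder.getInt32(19)});
    }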

@@ -3516,6 +3516,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  NODE_NAME_CASE(KILL)
  NODE_NAME_CASE(DUMMY_CHAIN)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(INIT_EXEC)
  NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
  NODE_NAME_CASE(SENDMSG)
  NODE_NAME_CASE(SENDMSGHALT)
  NODE_NAME_CASE(INTERP_MOV)

@@ -369,6 +369,8 @@ enum NodeType : unsigned {
  BUILD_VERTICAL_VECTOR,
  /// Pointer to the start of the shader's constant data.
  CONST_DATA_PTR,
  INIT_EXEC,
  INIT_EXEC_FROM_INPUT,
  SENDMSG,
  SENDMSGHALT,
  INTERP_MOV,

@@ -299,6 +299,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,

def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;

def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
  SDTypeProfile<0, 1, [SDTCisInt<0>]>,
  [SDNPHasChain, SDNPInGlue]>;

def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
  SDTypeProfile<0, 2,
    [SDTCisInt<0>, SDTCisInt<1>]>,
  [SDNPHasChain, SDNPInGlue]>;

def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
  SDTypeProfile<0, 1, [SDTCisInt<0>]>,
  [SDNPHasChain, SDNPInGlue]>;

@@ -1957,6 +1957,63 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC:
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
            AMDGPU::EXEC)
        .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *FirstMI = &*BB->begin();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    unsigned InputReg = MI.getOperand(0).getReg();
    unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    bool Found = false;

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; I++) {
      if (I->getOpcode() != TargetOpcode::COPY ||
          I->getOperand(0).getReg() != InputReg)
        continue;

      if (I == FirstMI) {
        FirstMI = &*++BB->begin();
      } else {
        I->removeFromParent();
        BB->insert(FirstMI, &*I);
      }
      Found = true;
      break;
    }
    assert(Found);

    // This should be before all vector instructions.
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
        .addReg(InputReg)
        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
            AMDGPU::EXEC)
        .addReg(CountReg)
        .addImm(0);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
        .addImm(64);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
            AMDGPU::EXEC)
        .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::GET_GROUPSTATICSIZE: {
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
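
An editorial aside, not part of the commit: the S_BFE_U32 source immediate
built above packs the field offset into its low bits and the field width into
bits [22:16]; the inserter always extracts a 7-bit count. A helper with the
invented name bfeImm would compute it as follows.

    // Invented helper mirroring the immediate computed by the inserter above:
    // offset in the low bits (masked with 0x7f), width 7 in bits [22:16].
    // bfeImm(8) == 0x70008 and bfeImm(19) == 0x70013, matching the s_bfe_u32
    // immediates checked in the new test below.
    unsigned bfeImm(unsigned Offset) {
      return (Offset & 0x7f) | (7u << 16);
    }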

@@ -3224,6 +3281,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
    return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
                       Op.getOperand(2), Glue);
  }
  case Intrinsic::amdgcn_init_exec: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
                       Op.getOperand(2));
  }
  case Intrinsic::amdgcn_init_exec_from_input: {
    return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
                       Op.getOperand(2), Op.getOperand(3));
  }
  case AMDGPUIntrinsic::SI_tbuffer_store: {
    SDValue Ops[] = {
      Chain,
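
A note on operand numbering in the hunk above: for an ISD::INTRINSIC_VOID
node, operand 0 is the chain and operand 1 is the intrinsic ID, so
Op.getOperand(2) and Op.getOperand(3) carry the intrinsic's first and second
IR arguments (here, the packed SGPR input and the bit offset).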

@@ -286,6 +286,19 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src), []> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

@@ -399,6 +412,16 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
} // End SubtargetPredicate = isGCN

let Predicates = [isGCN] in {
def : Pat <
  (AMDGPUinit_exec i64:$src),
  (SI_INIT_EXEC (as_i64imm $src))
>;

def : Pat <
  (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
  (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
>;

def : Pat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)

@@ -0,0 +1,80 @@
;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN

; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_i32_e32 v0, vcc, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_i32_e32 v0, vcc, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }