From 2d82590f640e4c014d7908b330d3681c951b1dcb Mon Sep 17 00:00:00 2001 From: Marek Olsak Date: Fri, 28 Apr 2017 20:21:58 +0000 Subject: [PATCH] AMDGPU: Add new amdgcn.init.exec intrinsics v2: More tests, bug fixes, cosmetic changes. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D31762 llvm-svn: 301677 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 ++++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 + llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 9 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 65 +++++++++++++++ llvm/lib/Target/AMDGPU/SIInstructions.td | 23 ++++++ .../CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll | 80 +++++++++++++++++++ 7 files changed, 196 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 21d8a15e7e7a..1249c1f846f2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -108,6 +108,21 @@ def int_amdgcn_implicit_buffer_ptr : GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; +// Set EXEC to the 64-bit value given. +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec : Intrinsic<[], + [llvm_i64_ty], // 64-bit literal constant + [IntrConvergent]>; + +// Set EXEC according to a thread count packed in an SGPR input: +// thread_count = (input >> bitoffset) & 0x7f; +// This is always moved to the beginning of the basic block. +def int_amdgcn_init_exec_from_input : Intrinsic<[], + [llvm_i32_ty, // 32-bit SGPR input + llvm_i32_ty], // bit offset of the thread count + [IntrConvergent]>; + + //===----------------------------------------------------------------------===// // Instruction Intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 29ded85ad0ab..82f792c343b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3516,6 +3516,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(INIT_EXEC) + NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index de050196d92e..7a78a8253f4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -369,6 +369,8 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, + INIT_EXEC, + INIT_EXEC_FROM_INPUT, SENDMSG, SENDMSGHALT, INTERP_MOV, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index c1706d12a2ea..353cc5742791 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -299,6 +299,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", + SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 41043804b4a2..dd1232c77f80 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1957,6 +1957,63 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; + case AMDGPU::SI_INIT_EXEC: + // This should be before all vector instructions. + BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + MachineInstr *FirstMI = &*BB->begin(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned InputReg = MI.getOperand(0).getReg(); + unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + bool Found = false; + + // Move the COPY of the input reg to the beginning, so that we can use it. + for (auto I = BB->begin(); I != &MI; I++) { + if (I->getOpcode() != TargetOpcode::COPY || + I->getOperand(0).getReg() != InputReg) + continue; + + if (I == FirstMI) { + FirstMI = &*++BB->begin(); + } else { + I->removeFromParent(); + BB->insert(FirstMI, &*I); + } + Found = true; + break; + } + assert(Found); + + // This should be before all vector instructions. + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), + AMDGPU::EXEC) + .addReg(CountReg) + .addImm(0); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(64); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), + AMDGPU::EXEC) + .addImm(-1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) @@ -3224,6 +3281,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_init_exec: { + return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, + Op.getOperand(2)); + } + case Intrinsic::amdgcn_init_exec_from_input: { + return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, + Op.getOperand(2), Op.getOperand(3)); + } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Chain, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 3f6ddec70479..7ccb54f54e34 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -286,6 +286,19 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { let isReMaterializable = 1; } +def SI_INIT_EXEC : SPseudoInstSI < + (outs), (ins i64imm:$src), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; +} + +def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < + (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; +} + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -399,6 +412,16 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC (as_i64imm $src)) +>; + +def : Pat < + (AMDGPUinit_exec_from_input i32:$input, i32:$shift), + (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) +>; + def : Pat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll new file mode 100644 index 000000000000..617f1f19e360 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN + +; GCN-LABEL: {{^}}full_mask: +; GCN: s_mov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @full_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 -1) + ret float %s +} + +; GCN-LABEL: {{^}}partial_mask: +; GCN: s_mov_b64 exec, 0x1e240 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @partial_mask(float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec(i64 123456) + ret float %s +} + +; GCN-LABEL: {{^}}input_s3off8: +; GCN: s_bfe_u32 s0, s3, 0x70008 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8) + ret float %s +} + +; GCN-LABEL: {{^}}input_s0off19: +; GCN: s_bfe_u32 s0, s0, 0x70013 +; GCN: s_bfm_b64 exec, s0, 0 +; GCN: s_cmp_eq_u32 s0, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_f32_e32 v0, +define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) { +main_body: + %s = fadd float %a, %b + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %s +} + +; GCN-LABEL: {{^}}reuse_input: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) { +main_body: + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + %s = add i32 %a, %count + %f = sitofp i32 %s to float + ret float %f +} + +; GCN-LABEL: {{^}}reuse_input2: +; GCN: s_bfe_u32 s1, s0, 0x70013 +; GCN: s_bfm_b64 exec, s1, 0 +; GCN: s_cmp_eq_u32 s1, 64 +; GCN: s_cmov_b64 exec, -1 +; GCN: v_add_i32_e32 v0, vcc, s0, v0 +define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) { +main_body: + %s = add i32 %a, %count + %f = sitofp i32 %s to float + call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19) + ret float %f +} + +declare void @llvm.amdgcn.init.exec(i64) #1 +declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1 + +attributes #1 = { convergent }