diff --git a/llvm/lib/Target/R600/AMDGPU.h b/llvm/lib/Target/R600/AMDGPU.h
index 261075e1e95c..13379e7e2556 100644
--- a/llvm/lib/Target/R600/AMDGPU.h
+++ b/llvm/lib/Target/R600/AMDGPU.h
@@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 // SI Passes
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -47,6 +48,9 @@ FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
 
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
diff --git a/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp b/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
index b2cd988ad050..80142f0016a6 100644
--- a/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -159,6 +159,8 @@ bool AMDGPUPassConfig::addInstSelector() {
     addPass(createSIFixSGPRCopiesPass(*TM));
   }
 
+  addPass(createSILowerI1CopiesPass());
+  addPass(createSIFoldOperandsPass());
   return false;
 }
 
diff --git a/llvm/lib/Target/R600/CMakeLists.txt b/llvm/lib/Target/R600/CMakeLists.txt
index ed0a21684dc8..3b703e72943e 100644
--- a/llvm/lib/Target/R600/CMakeLists.txt
+++ b/llvm/lib/Target/R600/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
   SIAnnotateControlFlow.cpp
   SIFixSGPRCopies.cpp
   SIFixSGPRLiveRanges.cpp
+  SIFoldOperands.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
diff --git a/llvm/lib/Target/R600/SIFoldOperands.cpp b/llvm/lib/Target/R600/SIFoldOperands.cpp
new file mode 100644
index 000000000000..761e8665e3a5
--- /dev/null
+++ b/llvm/lib/Target/R600/SIFoldOperands.cpp
@@ -0,0 +1,202 @@
+//===-- SIFoldOperands.cpp - Fold operands ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+class SIFoldOperands : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFoldOperands() : MachineFunctionPass(ID) {
+    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Fold Operands";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
+                      "SI Fold Operands", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
+                    "SI Fold Operands", false, false)
+
+char SIFoldOperands::ID = 0;
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+
+FunctionPass *llvm::createSIFoldOperandsPass() {
+  return new SIFoldOperands();
+}
+
+static bool isSafeToFold(unsigned Opcode) {
+  switch(Opcode) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool updateOperand(MachineInstr *MI, unsigned OpNo,
+                          const MachineOperand &New,
+                          const TargetRegisterInfo &TRI) {
+  MachineOperand &Old = MI->getOperand(OpNo);
+  assert(Old.isReg());
+
+  if (New.isImm()) {
+    Old.ChangeToImmediate(New.getImm());
+    return true;
+  }
+
+  if (New.isFPImm()) {
+    Old.ChangeToFPImmediate(New.getFPImm());
+    return true;
+  }
+
+  if (New.isReg()) {
+    if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(New.getReg())) {
+      Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
+      return true;
+    }
+  }
+
+  // FIXME: Handle physical registers.
+
+  return false;
+}
+
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (!isSafeToFold(MI.getOpcode()))
+        continue;
+
+      MachineOperand &OpToFold = MI.getOperand(1);
+
+      // FIXME: Fold operands with subregs.
+      if (OpToFold.isReg() &&
+          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
+           OpToFold.getSubReg()))
+        continue;
+
+      std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
+      for (MachineRegisterInfo::use_iterator
+           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
+           Use != E; ++Use) {
+
+        MachineInstr *UseMI = Use->getParent();
+        const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
+
+        // FIXME: Fold operands with subregs.
+        if (UseOp.isReg() && UseOp.getSubReg()) {
+          continue;
+        }
+
+        // In order to fold immediates into copies, we need to change the
+        // copy to a MOV.
+        if ((OpToFold.isImm() || OpToFold.isFPImm()) &&
+            UseMI->getOpcode() == AMDGPU::COPY) {
+          const TargetRegisterClass *TRC =
+              MRI.getRegClass(UseMI->getOperand(0).getReg());
+
+          if (TRC->getSize() == 4) {
+            if (TRI.isSGPRClass(TRC))
+              UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+            else
+              UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+          } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
+            UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
+          } else {
+            continue;
+          }
+        }
+
+        const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+        // Don't fold into target independent nodes. Target independent opcodes
+        // don't have defined register classes.
+        if (UseDesc.isVariadic() ||
+            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
+          continue;
+
+        // Normal substitution
+        if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &OpToFold)) {
+          FoldList.push_back(std::make_pair(UseMI, Use.getOperandNo()));
+          continue;
+        }
+
+        // FIXME: We could commute the instruction to create more
+        // opportunities for folding.
+        // This will only be useful if we have 32-bit instructions.
+
+        // FIXME: We could try to change the instruction from 64-bit to 32-bit
+        // to enable more folding opportunities. The shrink operands pass
+        // already does this.
+      }
+
+      for (std::pair<MachineInstr *, unsigned> Fold : FoldList) {
+        if (updateOperand(Fold.first, Fold.second, OpToFold, TRI)) {
+          // Clear kill flags.
+          if (OpToFold.isReg())
+            OpToFold.setIsKill(false);
+          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+                Fold.second << " of " << *Fold.first << '\n');
+        }
+      }
+    }
+  }
+  return false;
+}
diff --git a/llvm/test/CodeGen/R600/extload.ll b/llvm/test/CodeGen/R600/extload.ll
index 5bda8f8fc7b3..10a307fda8bb 100644
--- a/llvm/test/CodeGen/R600/extload.ll
+++ b/llvm/test/CodeGen/R600/extload.ll
@@ -87,10 +87,9 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
+; SI: buffer_load_ubyte v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8 addrspace(1)* %in, align 8
   %ext = zext i8 %a to i64
@@ -99,10 +98,9 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
+; SI: buffer_load_ushort v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16 addrspace(1)* %in, align 8
   %ext = zext i16 %a to i64
@@ -111,10 +109,9 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
-; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; SI-DAG: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: buffer_store_dwordx2
+; SI: buffer_load_dword v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32 addrspace(1)* %in, align 8
   %ext = zext i32 %a to i64
diff --git a/llvm/test/CodeGen/R600/local-atomics.ll b/llvm/test/CodeGen/R600/local-atomics.ll
index 2ac811f26d8d..e9baa080670a 100644
--- a/llvm/test/CodeGen/R600/local-atomics.ll
+++ b/llvm/test/CodeGen/R600/local-atomics.ll
@@ -69,8 +69,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
 ; EG: LDS_ADD_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
 ; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
@@ -81,8 +80,7 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
 ; EG: LDS_ADD_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
 ; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
@@ -129,8 +127,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
 ; EG: LDS_SUB_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
 ; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
@@ -141,8 +138,7 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
 ; EG: LDS_SUB_RET *
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
 ; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
@@ -361,8 +357,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
 ; SI: s_endpgm
 define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
@@ -371,8 +366,7 @@ define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
 ; SI: s_endpgm
 define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
@@ -411,8 +405,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
 ; SI: s_endpgm
 define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
@@ -421,8 +414,7 @@ define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
-; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
 ; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
 ; SI: s_endpgm
 define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
diff --git a/llvm/test/CodeGen/R600/operand-folding.ll b/llvm/test/CodeGen/R600/operand-folding.ll
new file mode 100644
index 000000000000..05177b475b15
--- /dev/null
+++ b/llvm/test/CodeGen/R600/operand-folding.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}fold_sgpr:
+; CHECK: v_add_i32_e32 v{{[0-9]+}}, s
+define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+entry:
+  %tmp0 = icmp ne i32 %fold, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %id = call i32 @llvm.r600.read.tidig.x()
+  %offset = add i32 %fold, %id
+  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 %offset
+  store i32 0, i32 addrspace(1)* %tmp1
+  br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fold_imm:
+; CHECK: v_or_i32_e32 v{{[0-9]+}}, 5
+define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+entry:
+  %fold = add i32 3, 2
+  %tmp0 = icmp ne i32 %cmp, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %id = call i32 @llvm.r600.read.tidig.x()
+  %val = or i32 %id, %fold
+  store i32 %val, i32 addrspace(1)* %out
+  br label %endif
+
+endif:
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+attributes #0 = { readnone }
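; Reviewer note, for illustration only (not part of the patch itself): the
; revised FileCheck lines above summarize what the new pass buys us. Before
; SIFoldOperands, an immediate reached a VALU use through a scalar move plus a
; copy, e.g.
;   s_mov_b32 s2, -1
;   v_mov_b32_e32 v0, s2
; With the pass in the pipeline, the immediate is folded directly into the use
; and the same code is expected to collapse to the single instruction
;   v_mov_b32_e32 v0, -1
; which is exactly what the updated local-atomics.ll and extload.ll checks
; assert. The concrete register numbers (s2, v0) are made up for this example.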