diff --git a/llvm/lib/Target/R600/AMDGPU.h b/llvm/lib/Target/R600/AMDGPU.h index e2d1caf86767..feec1c539c52 100644 --- a/llvm/lib/Target/R600/AMDGPU.h +++ b/llvm/lib/Target/R600/AMDGPU.h @@ -29,6 +29,7 @@ FunctionPass *createR600VectorRegMerger(TargetMachine &tm); FunctionPass *createR600TextureIntrinsicsReplacer(); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm); +FunctionPass *createR600ClauseMergePass(TargetMachine &tm); FunctionPass *createR600Packetizer(TargetMachine &tm); FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm); diff --git a/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp b/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp index 2119ed363d91..66585e432e3c 100644 --- a/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -168,10 +168,11 @@ bool AMDGPUPassConfig::addPostRegAlloc() { bool AMDGPUPassConfig::addPreSched2() { const AMDGPUSubtarget &ST = TM->getSubtarget(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) addPass(createR600EmitClauseMarkers(*TM)); - } addPass(&IfConverterID); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + addPass(createR600ClauseMergePass(*TM)); return false; } diff --git a/llvm/lib/Target/R600/CMakeLists.txt b/llvm/lib/Target/R600/CMakeLists.txt index 658eeea81480..7bdfa7e3b1f1 100644 --- a/llvm/lib/Target/R600/CMakeLists.txt +++ b/llvm/lib/Target/R600/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_target(R600CodeGen AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp AMDGPURegisterInfo.cpp + R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp R600EmitClauseMarkers.cpp R600ExpandSpecialInstrs.cpp diff --git a/llvm/lib/Target/R600/R600ClauseMergePass.cpp b/llvm/lib/Target/R600/R600ClauseMergePass.cpp new file mode 100644 index 000000000000..33d2ca32577d --- /dev/null +++ b/llvm/lib/Target/R600/R600ClauseMergePass.cpp @@ -0,0 +1,204 @@ +//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// This pass is merging consecutive CFAlus where applicable. +/// It needs to be called after IfCvt for best results. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "r600mergeclause" +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +static bool isCFAlu(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU: + case AMDGPU::CF_ALU_PUSH_BEFORE: + return true; + default: + return false; + } +} + +class R600ClauseMergePass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned getCFAluSize(const MachineInstr *MI) const; + bool isCFAluEnabled(const MachineInstr *MI) const; + + /// IfCvt pass can generate "disabled" ALU clause marker that need to be + /// removed and their content affected to the previous alu clause. + /// This function parse instructions after CFAlu untill it find a disabled + /// CFAlu and merge the content, or an enabled CFAlu. + void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + + /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if + /// it is the case. + bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) + const; + +public: + R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const; +}; + +char R600ClauseMergePass::ID = 0; + +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); +} + +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); +} + +void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) + const { + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + I++; + do { + while (I!= E && !isCFAlu(I)) + I++; + if (I == E) + return; + MachineInstr *MI = I++; + if (isCFAluEnabled(MI)) + break; + CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI->eraseFromParent(); + } while (I != E); +} + +bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, + const MachineInstr *LatrCFAlu) const { + assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + unsigned RootInstCount = getCFAluSize(RootCFAlu), + LaterInstCount = getCFAluSize(LatrCFAlu); + unsigned CumuledInsts = RootInstCount + LaterInstCount; + if (CumuledInsts >= TII->getMaxAlusPerClause()) { + DEBUG(dbgs() << "Excess inst counts\n"); + return false; + } + if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return false; + // Is KCache Bank 0 compatible ? + int Mode0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + int KBank0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + int KBank0LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + if (LatrCFAlu->getOperand(Mode0Idx).getImm() && + RootCFAlu->getOperand(Mode0Idx).getImm() && + (LatrCFAlu->getOperand(KBank0Idx).getImm() != + RootCFAlu->getOperand(KBank0Idx).getImm() || + LatrCFAlu->getOperand(KBank0LineIdx).getImm() != + RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + // Is KCache Bank 1 compatible ? + int Mode1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + int KBank1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + int KBank1LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + if (LatrCFAlu->getOperand(Mode1Idx).getImm() && + RootCFAlu->getOperand(Mode1Idx).getImm() && + (LatrCFAlu->getOperand(KBank1Idx).getImm() != + RootCFAlu->getOperand(KBank1Idx).getImm() || + LatrCFAlu->getOperand(KBank1LineIdx).getImm() != + RootCFAlu->getOperand(KBank1LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { + RootCFAlu->getOperand(Mode0Idx).setImm( + LatrCFAlu->getOperand(Mode0Idx).getImm()); + RootCFAlu->getOperand(KBank0Idx).setImm( + LatrCFAlu->getOperand(KBank0Idx).getImm()); + RootCFAlu->getOperand(KBank0LineIdx).setImm( + LatrCFAlu->getOperand(KBank0LineIdx).getImm()); + } + if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { + RootCFAlu->getOperand(Mode1Idx).setImm( + LatrCFAlu->getOperand(Mode1Idx).getImm()); + RootCFAlu->getOperand(KBank1Idx).setImm( + LatrCFAlu->getOperand(KBank1Idx).getImm()); + RootCFAlu->getOperand(KBank1LineIdx).setImm( + LatrCFAlu->getOperand(KBank1LineIdx).getImm()); + } + RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); + RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); + return true; +} + +bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getTarget().getInstrInfo()); + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator LatestCFAlu = E; + while (I != E) { + MachineInstr *MI = I++; + if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || + TII->mustBeLastInClause(MI->getOpcode())) + LatestCFAlu = E; + if (!isCFAlu(MI)) + continue; + cleanPotentialDisabledCFAlu(MI); + + if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { + MI->eraseFromParent(); + } else { + assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); + LatestCFAlu = MI; + } + } + } + return false; +} + +const char *R600ClauseMergePass::getPassName() const { + return "R600 Merge Clause Markers Pass"; +} + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { + return new R600ClauseMergePass(TM); +} diff --git a/llvm/lib/Target/R600/R600InstrInfo.cpp b/llvm/lib/Target/R600/R600InstrInfo.cpp index b2c2f93dfe87..ebf48cf01570 100644 --- a/llvm/lib/Target/R600/R600InstrInfo.cpp +++ b/llvm/lib/Target/R600/R600InstrInfo.cpp @@ -153,6 +153,24 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::LDS_1A2D)); } +bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { + if (isALUInstr(MI->getOpcode())) + return true; + if (isVector(*MI) || isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } +} + bool R600InstrInfo::isTransOnly(unsigned Opcode) const { if (ST.hasCaymanISA()) return false; diff --git a/llvm/lib/Target/R600/R600InstrInfo.h b/llvm/lib/Target/R600/R600InstrInfo.h index 235a8751ac77..1a99f67b3455 100644 --- a/llvm/lib/Target/R600/R600InstrInfo.h +++ b/llvm/lib/Target/R600/R600InstrInfo.h @@ -66,6 +66,10 @@ namespace llvm { bool hasInstrModifiers(unsigned Opcode) const; bool isLDSInstr(unsigned Opcode) const; + /// \returns true if this \p Opcode represents an ALU instruction or an + /// instruction that will be lowered in ExpandSpecialInstrs Pass. + bool canBeConsideredALU(const MachineInstr *MI) const; + bool isTransOnly(unsigned Opcode) const; bool isTransOnly(const MachineInstr *MI) const; bool isVectorOnly(unsigned Opcode) const; diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td index f38fa5dc5711..329b32788e65 100644 --- a/llvm/lib/Target/R600/R600Instructions.td +++ b/llvm/lib/Target/R600/R600Instructions.td @@ -590,6 +590,7 @@ i32imm:$COUNT, i32imm:$Enabled), let ALT_CONST = 0; let WHOLE_QUAD_MODE = 0; let BARRIER = 1; + let UseNamedOperandTable = 1; let Inst{31-0} = Word0; let Inst{63-32} = Word1; diff --git a/llvm/test/CodeGen/R600/jump-address.ll b/llvm/test/CodeGen/R600/jump-address.ll index a1c26d1ccb77..26c298b9d819 100644 --- a/llvm/test/CodeGen/R600/jump-address.ll +++ b/llvm/test/CodeGen/R600/jump-address.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; CHECK: JUMP @10 +; CHECK: JUMP @5 ; CHECK: EXPORT ; CHECK-NOT: EXPORT diff --git a/llvm/test/CodeGen/R600/loop-address.ll b/llvm/test/CodeGen/R600/loop-address.ll index acded22c3cee..b46d8e9dfb04 100644 --- a/llvm/test/CodeGen/R600/loop-address.ll +++ b/llvm/test/CodeGen/R600/loop-address.ll @@ -1,9 +1,9 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ;CHECK: ALU_PUSH -;CHECK: LOOP_START_DX10 @13 -;CHECK: LOOP_BREAK @12 -;CHECK: POP @12 +;CHECK: LOOP_START_DX10 @11 +;CHECK: LOOP_BREAK @10 +;CHECK: POP @10 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" target triple = "r600--" diff --git a/llvm/test/CodeGen/R600/selectcc-opt.ll b/llvm/test/CodeGen/R600/selectcc-opt.ll index 9bae4019084a..834c03069522 100644 --- a/llvm/test/CodeGen/R600/selectcc-opt.ll +++ b/llvm/test/CodeGen/R600/selectcc-opt.ll @@ -30,7 +30,8 @@ ENDIF: ; CHECK: @test_b ; CHECK: SET{{[GTEQN]+}}_DX10 -; CHECK: PRED_ +; CHECK-NEXT: PRED_ +; CHECK-NEXT: ALU clause starting define void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0