forked from OSchip/llvm-project
AMDGPU/SI: Detect uniform branches and emit s_cbranch instructions
Reviewers: arsenm Subscribers: mareko, MatzeB, qcolombet, arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D16603 llvm-svn: 260765
This commit is contained in:
parent
0de36ec169
commit
bc4497b13c
|
@ -78,6 +78,7 @@ FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
|
|||
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
|
||||
extern char &AMDGPUPromoteAllocaID;
|
||||
|
||||
FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
|
||||
Pass *createAMDGPUStructurizeCFGPass();
|
||||
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
|
||||
ModulePass *createAMDGPUAlwaysInlinePass();
|
||||
|
|
|
@ -43,6 +43,7 @@ public:
|
|||
AU.setPreservesAll();
|
||||
}
|
||||
|
||||
void visitBranchInst(BranchInst &I);
|
||||
void visitLoadInst(LoadInst &I);
|
||||
|
||||
};
|
||||
|
@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
|
|||
|
||||
char AMDGPUAnnotateUniformValues::ID = 0;
|
||||
|
||||
static void setUniformMetadata(Instruction *I) {
|
||||
I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
|
||||
}
|
||||
|
||||
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
|
||||
if (I.isUnconditional())
|
||||
return;
|
||||
|
||||
Value *Cond = I.getCondition();
|
||||
if (!DA->isUniform(Cond))
|
||||
return;
|
||||
|
||||
setUniformMetadata(I.getParent()->getTerminator());
|
||||
}
|
||||
|
||||
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
|
||||
Value *Ptr = I.getPointerOperand();
|
||||
if (!DA->isUniform(Ptr))
|
||||
return;
|
||||
|
||||
if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
|
||||
PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
|
||||
setUniformMetadata(PtrI);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUInstrInfo.h"
|
||||
#include "AMDGPUIntrinsicInfo.h"
|
||||
#include "AMDGPUISelLowering.h" // For AMDGPUISD
|
||||
#include "AMDGPURegisterInfo.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
|
@ -36,6 +37,20 @@ using namespace llvm;
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace {
|
||||
|
||||
static bool isCBranchSCC(const SDNode *N) {
|
||||
assert(N->getOpcode() == ISD::BRCOND);
|
||||
if (!N->hasOneUse())
|
||||
return false;
|
||||
|
||||
SDValue Cond = N->getOperand(1);
|
||||
if (Cond.getOpcode() == ISD::CopyToReg)
|
||||
Cond = Cond.getOperand(2);
|
||||
return Cond.getOpcode() == ISD::SETCC &&
|
||||
Cond.getOperand(0).getValueType() == MVT::i32 &&
|
||||
Cond.hasOneUse();
|
||||
}
|
||||
|
||||
/// AMDGPU specific code to select AMDGPU machine instructions for
|
||||
/// SelectionDAG operations.
|
||||
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
|
||||
|
@ -82,6 +97,8 @@ private:
|
|||
bool isLocalLoad(const LoadSDNode *N) const;
|
||||
bool isRegionLoad(const LoadSDNode *N) const;
|
||||
|
||||
bool isUniformBr(const SDNode *N) const;
|
||||
|
||||
SDNode *glueCopyToM0(SDNode *N) const;
|
||||
|
||||
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
|
||||
|
@ -143,6 +160,7 @@ private:
|
|||
uint32_t Offset, uint32_t Width);
|
||||
SDNode *SelectS_BFEFromShifts(SDNode *N);
|
||||
SDNode *SelectS_BFE(SDNode *N);
|
||||
SDNode *SelectBRCOND(SDNode *N);
|
||||
|
||||
// Include the pieces autogenerated from the target description.
|
||||
#include "AMDGPUGenDAGISel.inc"
|
||||
|
@ -509,6 +527,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
|||
break;
|
||||
|
||||
return SelectS_BFE(N);
|
||||
case ISD::BRCOND:
|
||||
return SelectBRCOND(N);
|
||||
}
|
||||
|
||||
return SelectCode(N);
|
||||
|
@ -623,6 +643,11 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
|
|||
return false;
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
|
||||
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
|
||||
return BB->getTerminator()->getMetadata("amdgpu.uniform");
|
||||
}
|
||||
|
||||
const char *AMDGPUDAGToDAGISel::getPassName() const {
|
||||
return "AMDGPU DAG->DAG Pattern Instruction Selection";
|
||||
}
|
||||
|
@ -1365,6 +1390,36 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
|
|||
return SelectCode(N);
|
||||
}
|
||||
|
||||
SDNode *AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
|
||||
SDValue Cond = N->getOperand(1);
|
||||
|
||||
if (isCBranchSCC(N)) {
|
||||
// This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
|
||||
return SelectCode(N);
|
||||
}
|
||||
|
||||
// The result of VOPC instructions is or'd against ~EXEC before it is
|
||||
// written to vcc or another SGPR. This means that the value '1' is always
|
||||
// written to the corresponding bit for results that are masked. In order
|
||||
// to correctly check against vccz, we need to and VCC with the EXEC
|
||||
// register in order to clear the value from the masked bits.
|
||||
|
||||
SDLoc SL(N);
|
||||
|
||||
SDNode *MaskedCond =
|
||||
CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
|
||||
CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
|
||||
Cond);
|
||||
SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
|
||||
SDValue(MaskedCond, 0),
|
||||
SDValue()); // Passing SDValue() adds a
|
||||
// glue output.
|
||||
return CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
|
||||
N->getOperand(2), // Basic Block
|
||||
VCC.getValue(0), // Chain
|
||||
VCC.getValue(1)); // Glue
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const {
|
||||
|
||||
|
|
|
@ -240,10 +240,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
|
|||
|
||||
bool
|
||||
AMDGPUPassConfig::addPreISel() {
|
||||
const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
|
||||
addPass(createFlattenCFGPass());
|
||||
if (ST.IsIRStructurizerEnabled())
|
||||
addPass(createStructurizeCFGPass());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -263,6 +260,9 @@ bool AMDGPUPassConfig::addGCPasses() {
|
|||
|
||||
bool R600PassConfig::addPreISel() {
|
||||
AMDGPUPassConfig::addPreISel();
|
||||
const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
|
||||
if (ST.IsIRStructurizerEnabled())
|
||||
addPass(createStructurizeCFGPass());
|
||||
addPass(createR600TextureIntrinsicsReplacer());
|
||||
return false;
|
||||
}
|
||||
|
@ -301,11 +301,11 @@ bool GCNPassConfig::addPreISel() {
|
|||
// FIXME: We need to run a pass to propagate the attributes when calls are
|
||||
// supported.
|
||||
addPass(&AMDGPUAnnotateKernelFeaturesID);
|
||||
|
||||
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
|
||||
addPass(createSinkingPass());
|
||||
addPass(createSITypeRewriter());
|
||||
addPass(createSIAnnotateControlFlowPass());
|
||||
addPass(createAMDGPUAnnotateUniformValues());
|
||||
addPass(createSIAnnotateControlFlowPass());
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/CodeGen/BasicTTIImpl.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Target/CostTable.h"
|
||||
#include "llvm/Target/TargetLowering.h"
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/ADT/DepthFirstIterator.h"
|
||||
#include "llvm/Analysis/DivergenceAnalysis.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
|
@ -43,6 +44,7 @@ static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
|
|||
static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
|
||||
|
||||
class SIAnnotateControlFlow : public FunctionPass {
|
||||
DivergenceAnalysis *DA;
|
||||
|
||||
Type *Boolean;
|
||||
Type *Void;
|
||||
|
@ -105,6 +107,7 @@ public:
|
|||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<DivergenceAnalysis>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
|
@ -115,6 +118,7 @@ public:
|
|||
|
||||
INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
|
||||
"Annotate SI Control Flow", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
|
||||
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
|
||||
"Annotate SI Control Flow", false, false)
|
||||
|
||||
|
@ -200,6 +204,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
|
|||
|
||||
/// \brief Open a new "If" block
|
||||
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
|
||||
if (DA->isUniform(Term->getCondition())) {
|
||||
return;
|
||||
}
|
||||
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
|
||||
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
|
||||
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
|
||||
|
@ -207,6 +214,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
|
|||
|
||||
/// \brief Close the last "If" block and open a new "Else" block
|
||||
void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
|
||||
if (DA->isUniform(Term->getCondition())) {
|
||||
return;
|
||||
}
|
||||
Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
|
||||
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
|
||||
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
|
||||
|
@ -290,6 +300,10 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
|
|||
|
||||
/// \brief Handle a back edge (loop)
|
||||
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
|
||||
if (DA->isUniform(Term->getCondition())) {
|
||||
return;
|
||||
}
|
||||
|
||||
BasicBlock *BB = Term->getParent();
|
||||
llvm::Loop *L = LI->getLoopFor(BB);
|
||||
BasicBlock *Target = Term->getSuccessor(1);
|
||||
|
@ -311,6 +325,9 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
|
|||
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
|
||||
llvm::Loop *L = LI->getLoopFor(BB);
|
||||
|
||||
if (Stack.back().first != BB)
|
||||
return;
|
||||
|
||||
if (L && L->getHeader() == BB) {
|
||||
// We can't insert an EndCF call into a loop header, because it will
|
||||
// get executed on every iteration of the loop, when it should be
|
||||
|
@ -326,14 +343,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
|
|||
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
|
||||
}
|
||||
|
||||
CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
|
||||
Value *Exec = popSaved();
|
||||
if (!isa<UndefValue>(Exec))
|
||||
CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
|
||||
}
|
||||
|
||||
/// \brief Annotate the control flow with intrinsics so the backend can
|
||||
/// recognize if/then/else and loops.
|
||||
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
|
||||
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
DA = &getAnalysis<DivergenceAnalysis>();
|
||||
|
||||
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
|
||||
E = df_end(&F.getEntryBlock()); I != E; ++I) {
|
||||
|
@ -343,12 +364,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
|
|||
if (!Term || Term->isUnconditional()) {
|
||||
if (isTopOfStack(*I))
|
||||
closeControlFlow(*I);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (I.nodeVisited(Term->getSuccessor(1))) {
|
||||
if (isTopOfStack(*I))
|
||||
closeControlFlow(*I);
|
||||
|
||||
handleLoop(Term);
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -108,9 +108,20 @@ FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
|
|||
return new SIFixSGPRLiveRanges();
|
||||
}
|
||||
|
||||
static bool hasOnlyScalarBr(const MachineBasicBlock *MBB,
|
||||
const SIInstrInfo *TII) {
|
||||
for (MachineBasicBlock::const_iterator I = MBB->getFirstTerminator(),
|
||||
E = MBB->end(); I != E; ++I) {
|
||||
if (!TII->isSOPP(*I))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
||||
const SIInstrInfo *TII =
|
||||
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
|
||||
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
|
||||
MF.getSubtarget().getRegisterInfo());
|
||||
bool MadeChange = false;
|
||||
|
@ -147,7 +158,7 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
|
|||
}
|
||||
}
|
||||
|
||||
if (MBB->succ_size() < 2)
|
||||
if (MBB->succ_size() < 2 || hasOnlyScalarBr(MBB, TII))
|
||||
continue;
|
||||
|
||||
// We have structured control flow, so the number of successors should be
|
||||
|
|
|
@ -130,6 +130,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
|
|||
|
||||
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
|
||||
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
|
||||
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::i64, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
|
||||
|
||||
for (MVT VT : MVT::integer_valuetypes()) {
|
||||
if (VT == MVT::i64)
|
||||
|
@ -1192,6 +1196,23 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
|
|||
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
|
||||
}
|
||||
|
||||
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
|
||||
if (!Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN)
|
||||
return false;
|
||||
|
||||
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
|
||||
default: return false;
|
||||
case AMDGPUIntrinsic::amdgcn_if:
|
||||
case AMDGPUIntrinsic::amdgcn_else:
|
||||
case AMDGPUIntrinsic::amdgcn_break:
|
||||
case AMDGPUIntrinsic::amdgcn_if_break:
|
||||
case AMDGPUIntrinsic::amdgcn_else_break:
|
||||
case AMDGPUIntrinsic::amdgcn_loop:
|
||||
case AMDGPUIntrinsic::amdgcn_end_cf:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// This transforms the control flow intrinsics to get the branch destination as
|
||||
/// last parameter, also switches branch target with BR if the need arise
|
||||
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
||||
|
@ -1202,13 +1223,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|||
SDNode *Intr = BRCOND.getOperand(1).getNode();
|
||||
SDValue Target = BRCOND.getOperand(2);
|
||||
SDNode *BR = nullptr;
|
||||
SDNode *SetCC = nullptr;
|
||||
|
||||
if (Intr->getOpcode() == ISD::SETCC) {
|
||||
// As long as we negate the condition everything is fine
|
||||
SDNode *SetCC = Intr;
|
||||
assert(SetCC->getConstantOperandVal(1) == 1);
|
||||
assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
|
||||
ISD::SETNE);
|
||||
SetCC = Intr;
|
||||
Intr = SetCC->getOperand(0).getNode();
|
||||
|
||||
} else {
|
||||
|
@ -1217,7 +1236,16 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|||
Target = BR->getOperand(1);
|
||||
}
|
||||
|
||||
assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
|
||||
if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) {
|
||||
// This is a uniform branch so we don't need to legalize.
|
||||
return BRCOND;
|
||||
}
|
||||
|
||||
assert(!SetCC ||
|
||||
(SetCC->getConstantOperandVal(1) == 1 &&
|
||||
isCFIntrinsic(Intr) &&
|
||||
cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
|
||||
ISD::SETNE));
|
||||
|
||||
// Build the result and
|
||||
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
|
||||
|
|
|
@ -60,6 +60,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
|
|||
|
||||
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
|
||||
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
|
||||
|
||||
bool isCFIntrinsic(const SDNode *Intr) const;
|
||||
public:
|
||||
SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
|
||||
|
||||
|
|
|
@ -1437,6 +1437,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
|
|||
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
|
||||
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
|
||||
|
||||
// Make sure we don't have SCC live-ins to basic blocks. moveToVALU assumes
|
||||
// all SCC users are in the same blocks as their defs.
|
||||
const MachineBasicBlock *MBB = MI->getParent();
|
||||
if (MI == &MBB->front()) {
|
||||
if (MBB->isLiveIn(AMDGPU::SCC)) {
|
||||
ErrInfo = "scc register cannot be live across blocks.";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Make sure the number of operands is correct.
|
||||
const MCInstrDesc &Desc = get(Opcode);
|
||||
if (!Desc.isVariadic() &&
|
||||
|
@ -1605,6 +1615,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
|
|||
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
|
||||
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
|
||||
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
|
||||
case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
|
||||
case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
|
||||
case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
|
||||
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
|
||||
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
|
||||
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
|
||||
case AMDGPU::S_LOAD_DWORD_IMM:
|
||||
case AMDGPU::S_LOAD_DWORD_SGPR:
|
||||
case AMDGPU::S_LOAD_DWORD_IMM_ci:
|
||||
|
@ -1621,6 +1637,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
|
|||
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
|
||||
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
|
||||
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
|
||||
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
|
||||
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1979,7 +1997,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
|
|||
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
|
||||
|
||||
// Legalize VOP2
|
||||
if (isVOP2(*MI)) {
|
||||
if (isVOP2(*MI) || isVOPC(*MI)) {
|
||||
legalizeOperandsVOP2(MRI, MI);
|
||||
return;
|
||||
}
|
||||
|
@ -2568,6 +2586,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
|
|||
Inst->eraseFromParent();
|
||||
continue;
|
||||
|
||||
case AMDGPU::S_CBRANCH_SCC0:
|
||||
case AMDGPU::S_CBRANCH_SCC1:
|
||||
// Clear unused bits of vcc
|
||||
BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
|
||||
.addReg(AMDGPU::EXEC)
|
||||
.addReg(AMDGPU::VCC);
|
||||
break;
|
||||
|
||||
case AMDGPU::S_BFE_U64:
|
||||
case AMDGPU::S_BFM_B64:
|
||||
llvm_unreachable("Moving this op to VALU not implemented");
|
||||
|
@ -2589,8 +2615,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
|
|||
// both.
|
||||
for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
|
||||
MachineOperand &Op = Inst->getOperand(i);
|
||||
if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
|
||||
if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
|
||||
Inst->RemoveOperand(i);
|
||||
addSCCDefUsersToVALUWorklist(Inst, Worklist);
|
||||
}
|
||||
}
|
||||
|
||||
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
|
||||
|
@ -2623,19 +2651,24 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
|
|||
Inst->addOperand(MachineOperand::CreateImm(BitWidth));
|
||||
}
|
||||
|
||||
// Update the destination register class.
|
||||
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
|
||||
if (!NewDstRC)
|
||||
continue;
|
||||
bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
|
||||
unsigned NewDstReg = AMDGPU::NoRegister;
|
||||
if (HasDst) {
|
||||
// Update the destination register class.
|
||||
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
|
||||
if (!NewDstRC)
|
||||
continue;
|
||||
|
||||
unsigned DstReg = Inst->getOperand(0).getReg();
|
||||
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
|
||||
MRI.replaceRegWith(DstReg, NewDstReg);
|
||||
unsigned DstReg = Inst->getOperand(0).getReg();
|
||||
NewDstReg = MRI.createVirtualRegister(NewDstRC);
|
||||
MRI.replaceRegWith(DstReg, NewDstReg);
|
||||
}
|
||||
|
||||
// Legalize the operands
|
||||
legalizeOperands(Inst);
|
||||
|
||||
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
|
||||
if (HasDst)
|
||||
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2910,6 +2943,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
|
|||
}
|
||||
}
|
||||
|
||||
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
|
||||
SmallVectorImpl<MachineInstr *> &Worklist) const {
|
||||
// This assumes that all the users of SCC are in the same block
|
||||
// as the SCC def.
|
||||
for (MachineBasicBlock::iterator I = SCCDefInst,
|
||||
E = SCCDefInst->getParent()->end(); I != E; ++I) {
|
||||
|
||||
// Exit if we find another SCC def.
|
||||
if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
|
||||
return;
|
||||
|
||||
if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
|
||||
Worklist.push_back(I);
|
||||
}
|
||||
}
|
||||
|
||||
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
|
||||
const MachineInstr &Inst) const {
|
||||
const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
|
||||
|
|
|
@ -59,6 +59,9 @@ private:
|
|||
unsigned Reg, MachineRegisterInfo &MRI,
|
||||
SmallVectorImpl<MachineInstr *> &Worklist) const;
|
||||
|
||||
void addSCCDefUsersToVALUWorklist(
|
||||
MachineInstr *SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const;
|
||||
|
||||
const TargetRegisterClass *
|
||||
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
|
||||
|
||||
|
|
|
@ -247,6 +247,30 @@ def si_truncstore_local_i16 : PatFrag <
|
|||
return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
|
||||
}]>;
|
||||
|
||||
def si_setcc_uniform : PatFrag <
|
||||
(ops node:$lhs, node:$rhs, node:$cond),
|
||||
(setcc node:$lhs, node:$rhs, node:$cond), [{
|
||||
for (SDNode *Use : N->uses()) {
|
||||
if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
|
||||
return false;
|
||||
|
||||
unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
|
||||
if (Reg != AMDGPU::SCC)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}]>;
|
||||
|
||||
def si_uniform_br : PatFrag <
|
||||
(ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
|
||||
return isUniformBr(N);
|
||||
}]>;
|
||||
|
||||
def si_uniform_br_scc : PatFrag <
|
||||
(ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
|
||||
return isCBranchSCC(N);
|
||||
}]>;
|
||||
|
||||
multiclass SIAtomicM0Glue2 <string op_name> {
|
||||
|
||||
def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
|
||||
|
@ -826,7 +850,8 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
|
|||
class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
|
||||
string opName, PatLeaf cond> : SOPC <
|
||||
op, (outs), (ins rc:$src0, rc:$src1),
|
||||
opName#" $src0, $src1", []> {
|
||||
opName#" $src0, $src1",
|
||||
[(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
|
||||
let Defs = [SCC];
|
||||
}
|
||||
|
||||
|
|
|
@ -336,18 +336,18 @@ defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
|
|||
// SOPC Instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
|
||||
def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
|
||||
def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
|
||||
def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
|
||||
def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
|
||||
def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
|
||||
def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
|
||||
def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
|
||||
def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
|
||||
def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
|
||||
def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
|
||||
def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
|
||||
def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>;
|
||||
def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32", COND_NE>;
|
||||
def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>;
|
||||
def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>;
|
||||
def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>;
|
||||
def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32", COND_SLE>;
|
||||
def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>;
|
||||
def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32", COND_NE >;
|
||||
def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>;
|
||||
def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>;
|
||||
def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
|
||||
def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
|
||||
////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
|
||||
////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
|
||||
////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
|
||||
|
@ -449,7 +449,8 @@ def S_CBRANCH_SCC0 : SOPP <
|
|||
>;
|
||||
def S_CBRANCH_SCC1 : SOPP <
|
||||
0x00000005, (ins sopp_brtarget:$simm16),
|
||||
"s_cbranch_scc1 $simm16"
|
||||
"s_cbranch_scc1 $simm16",
|
||||
[(si_uniform_br_scc SCC, bb:$simm16)]
|
||||
>;
|
||||
} // End Uses = [SCC]
|
||||
|
||||
|
@ -2130,7 +2131,7 @@ def : Pat <
|
|||
def : Pat <
|
||||
(i64 (ctpop i64:$src)),
|
||||
(i64 (REG_SEQUENCE SReg_64,
|
||||
(S_BCNT1_I32_B64 $src), sub0,
|
||||
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
|
||||
(S_MOV_B32 0), sub1))
|
||||
>;
|
||||
|
||||
|
@ -3030,10 +3031,12 @@ def : ZExt_i64_i32_Pat<anyext>;
|
|||
def : ZExt_i64_i1_Pat<zext>;
|
||||
def : ZExt_i64_i1_Pat<anyext>;
|
||||
|
||||
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
|
||||
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
|
||||
def : Pat <
|
||||
(i64 (sext i32:$src)),
|
||||
(REG_SEQUENCE SReg_64, $src, sub0,
|
||||
(S_ASHR_I32 $src, 31), sub1)
|
||||
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SGPR_32)), sub1)
|
||||
>;
|
||||
|
||||
def : Pat <
|
||||
|
|
|
@ -427,7 +427,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
|
|||
&AMDGPU::VReg_256RegClass,
|
||||
&AMDGPU::SReg_256RegClass,
|
||||
&AMDGPU::VReg_512RegClass,
|
||||
&AMDGPU::SReg_512RegClass
|
||||
&AMDGPU::SReg_512RegClass,
|
||||
&AMDGPU::SCC_CLASSRegClass,
|
||||
};
|
||||
|
||||
for (const TargetRegisterClass *BaseClass : BaseClasses) {
|
||||
|
@ -442,6 +443,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
|
|||
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
|
||||
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
|
||||
switch (RC->getSize()) {
|
||||
case 0: return false;
|
||||
case 1: return false;
|
||||
case 4:
|
||||
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
|
||||
case 8:
|
||||
|
|
|
@ -81,6 +81,11 @@ foreach Index = 0-255 in {
|
|||
// Groupings using register classes and tuples
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
|
||||
let CopyCost = -1;
|
||||
let isAllocatable = 0;
|
||||
}
|
||||
|
||||
// TODO: Do we need to set DwarfRegAlias on register tuples?
|
||||
|
||||
// SGPR 32-bit registers
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}v_and_i64_br:
|
||||
; SI: v_and_b32
|
||||
; SI: v_and_b32
|
||||
define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %if, label %endif
|
||||
|
||||
if:
|
||||
%a = load i64, i64 addrspace(1)* %aptr, align 8
|
||||
%b = load i64, i64 addrspace(1)* %bptr, align 8
|
||||
%and = and i64 %a, %b
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%tmp1 = phi i64 [%and, %if], [0, %entry]
|
||||
store i64 %tmp1, i64 addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
|
@ -244,26 +244,6 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
|
|||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_and_i64_br:
|
||||
; SI: v_and_b32
|
||||
; SI: v_and_b32
|
||||
define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
|
||||
entry:
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
br i1 %tmp0, label %if, label %endif
|
||||
|
||||
if:
|
||||
%a = load i64, i64 addrspace(1)* %aptr, align 8
|
||||
%b = load i64, i64 addrspace(1)* %bptr, align 8
|
||||
%and = and i64 %a, %b
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%tmp1 = phi i64 [%and, %if], [0, %entry]
|
||||
store i64 %tmp1, i64 addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_and_constant_i64:
|
||||
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
|
||||
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
|
||||
|
|
|
@ -13,11 +13,12 @@
|
|||
|
||||
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
|
||||
; GCN: {{^}}BB0_2:
|
||||
define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
|
||||
define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -42,11 +43,12 @@ done:
|
|||
; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; GCN: {{^}}BB1_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
|
||||
define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
|
||||
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -68,11 +70,12 @@ done:
|
|||
; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
|
||||
; GCN: {{^}}BB2_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
|
||||
define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
|
||||
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -94,11 +97,12 @@ done:
|
|||
; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
|
||||
; GCN: {{^}}BB3_2:
|
||||
; GCN: s_or_b64 exec
|
||||
define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
|
||||
define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
|
||||
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -125,14 +129,15 @@ done:
|
|||
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
|
||||
; GCN: {{^}}BB4_2:
|
||||
define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
|
||||
define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
%alloca = alloca [512 x i32], align 4
|
||||
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%add.arg = add i32 %arg, 8
|
||||
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -161,14 +166,15 @@ done:
|
|||
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
|
||||
; GCN: {{^}}BB5_2:
|
||||
define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
|
||||
define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
%alloca = alloca [512 x i32], align 4
|
||||
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%add.arg = add i32 %arg, 8
|
||||
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -192,12 +198,13 @@ done:
|
|||
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GCN: {{^}}BB6_2:
|
||||
define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
|
||||
define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
|
||||
entry:
|
||||
%offset.ext = zext i32 %offset to i64
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -226,11 +233,12 @@ attributes #1 = { nounwind }
|
|||
; GCN: s_and_saveexec_b64
|
||||
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -254,11 +262,12 @@ done:
|
|||
; GCN: s_and_saveexec_b64
|
||||
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -286,11 +295,12 @@ done:
|
|||
|
||||
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -317,11 +327,12 @@ done:
|
|||
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
|
||||
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -347,11 +358,12 @@ done:
|
|||
; GCN: s_addc_u32
|
||||
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -376,11 +388,12 @@ done:
|
|||
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
|
||||
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -413,11 +426,12 @@ done:
|
|||
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
|
||||
|
||||
; GCN: s_or_b64 exec, exec
|
||||
define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
|
||||
define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
|
||||
entry:
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
|
||||
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%tmp0 = icmp eq i32 %tid, 0
|
||||
br i1 %tmp0, label %endif, label %if
|
||||
|
||||
if:
|
||||
|
@ -432,3 +446,7 @@ endif:
|
|||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
|
|
@ -12,8 +12,9 @@
|
|||
; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
|
||||
; CHECK-NOT: s_or_b64 exec, exec
|
||||
; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
|
||||
define void @test(i32 addrspace(1)* %out, i32 %cond) {
|
||||
define void @test(i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%cond = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%tmp0 = icmp eq i32 %cond, 0
|
||||
br i1 %tmp0, label %if, label %loop
|
||||
|
||||
|
@ -32,3 +33,7 @@ done:
|
|||
store i32 %inc, i32 addrspace(1)* %tmp3
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
|
||||
attributes #0 = { readnone }
|
||||
|
|
|
@ -4,9 +4,8 @@
|
|||
; SILowerI1Copies was not handling IMPLICIT_DEF
|
||||
; SI-LABEL: {{^}}br_implicit_def:
|
||||
; SI: BB#0:
|
||||
; SI-NEXT: s_and_saveexec_b64
|
||||
; SI-NEXT: s_xor_b64
|
||||
; SI-NEXT: BB#1:
|
||||
; SI-NEXT: s_and_b64 vcc, exec
|
||||
; SI-NEXT: s_cbranch_vccnz
|
||||
define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
|
||||
bb:
|
||||
br i1 undef, label %bb1, label %bb2
|
||||
|
|
|
@ -10,9 +10,11 @@
|
|||
; SI: s_and_saveexec_b64
|
||||
; SI: s_xor_b64
|
||||
; SI: s_endpgm
|
||||
define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
|
||||
define void @br_i1_phi(i32 %arg) {
|
||||
bb:
|
||||
br i1 %arg1, label %bb2, label %bb3
|
||||
%tidig = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%cmp = trunc i32 %tidig to i1
|
||||
br i1 %cmp, label %bb2, label %bb3
|
||||
|
||||
bb2: ; preds = %bb
|
||||
br label %bb3
|
||||
|
@ -28,3 +30,7 @@ bb4: ; preds = %bb3
|
|||
bb6: ; preds = %bb4, %bb3
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
|
||||
attributes #0 = { readnone }
|
||||
|
|
|
@ -21,3 +21,21 @@ entry:
|
|||
}
|
||||
|
||||
attributes #0 = { "ShaderType"="0" }
|
||||
|
||||
|
||||
; CHECK: {{^}}branch_on_asm:
|
||||
; Make sure inline assembly is treted as divergent.
|
||||
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
|
||||
; CHECK: s_and_saveexec_b64
|
||||
define void @branch_on_asm(i32 addrspace(1)* %out) {
|
||||
%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
|
||||
%cmp = icmp eq i32 %zero, 0
|
||||
br i1 %cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 0, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -193,7 +193,9 @@ bb1: ; preds = %bb2
|
|||
|
||||
bb2: ; preds = %bb6, %bb
|
||||
%tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ]
|
||||
%tmp3 = fsub float undef, %tmp
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
|
||||
%f_tid = bitcast i32 %tid to float
|
||||
%tmp3 = fsub float %f_tid, %tmp
|
||||
%tmp5 = fcmp oeq float %tmp3, 1.000000e+04
|
||||
br i1 %tmp5, label %bb1, label %bb6
|
||||
|
||||
|
@ -203,3 +205,7 @@ bb6: ; preds = %bb2
|
|||
%tmp8 = fadd float %tmp7, undef
|
||||
br label %bb2
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
|
||||
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
|
|
@ -10,8 +10,8 @@
|
|||
; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
|
||||
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
|
||||
; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
|
||||
; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
|
||||
; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
|
||||
; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
|
||||
|
|
|
@ -431,5 +431,33 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; Make sure we legalize vopc operands after moving an sopc to the value.
|
||||
|
||||
; {{^}}sopc_vopc_legalize_bug:
|
||||
; GCN: s_load_dword [[SGPR:s[0-9]+]]
|
||||
; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
|
||||
; GCN: s_and_b64 vcc, exec, vcc
|
||||
; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
|
||||
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; GCN-NOHSA: buffer_store_dword [[ONE]]
|
||||
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
|
||||
; GCN; {{^}}[[EXIT]]:
|
||||
; GCN: s_endpgm
|
||||
define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
bb3: ; preds = %bb2
|
||||
%tmp0 = bitcast i32 %cond to float
|
||||
%tmp1 = fadd float %tmp0, 2.500000e-01
|
||||
%tmp2 = bitcast float %tmp1 to i32
|
||||
%tmp3 = icmp ult i32 %tmp2, %cond
|
||||
br i1 %tmp3, label %bb6, label %bb7
|
||||
|
||||
bb6:
|
||||
store i32 1, i32 addrspace(1)* %out
|
||||
br label %bb7
|
||||
|
||||
bb7: ; preds = %bb3
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
|
|
|
@ -379,7 +379,7 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra,
|
|||
; Make sure we don't try to emit i1 setcc ops
|
||||
; FUNC-LABEL: setcc-i1
|
||||
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
|
||||
; SI: v_cmp_eq_i32_e64 s[0:1], 0, [[AND]]
|
||||
; SI: s_cmp_eq_i32 [[AND]], 0
|
||||
define void @setcc-i1(i32 %in) {
|
||||
%and = and i32 %in, 1
|
||||
%cmp = icmp eq i32 %and, 0
|
||||
|
|
|
@ -10,9 +10,10 @@
|
|||
; SI: s_andn2_b64
|
||||
; s_cbranch_execnz [[LOOP_LABEL]]
|
||||
; SI: s_endpgm
|
||||
define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
|
||||
main_body:
|
||||
%0 = and i32 %a, %b
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%0 = and i32 %a, %tid
|
||||
%1 = trunc i32 %0 to i1
|
||||
br label %ENDIF
|
||||
|
||||
|
@ -39,9 +40,10 @@ ENDIF:
|
|||
; SI: s_cbranch_execnz [[LOOP_LABEL]]
|
||||
; SI: s_endpgm
|
||||
|
||||
define void @phi_cond_outside_loop(i32 %a, i32 %b) {
|
||||
define void @phi_cond_outside_loop(i32 %b) {
|
||||
entry:
|
||||
%0 = icmp eq i32 %a , 0
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%0 = icmp eq i32 %tid , 0
|
||||
br i1 %0, label %if, label %else
|
||||
|
||||
if:
|
||||
|
@ -61,3 +63,7 @@ loop:
|
|||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
|
|
@ -80,7 +80,8 @@ main_body:
|
|||
LOOP: ; preds = %ENDIF2795, %main_body
|
||||
%temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
|
||||
%temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
|
||||
%67 = icmp sgt i32 undef, 4
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%67 = icmp sgt i32 %tid, 4
|
||||
br i1 %67, label %ENDLOOP, label %ENDIF
|
||||
|
||||
ENDLOOP: ; preds = %ELSE2566, %LOOP
|
||||
|
@ -228,13 +229,19 @@ ENDIF: ; preds = %LOOP
|
|||
%199 = fcmp olt float undef, %.temp292.9
|
||||
%200 = and i1 %198, %199
|
||||
%temp292.11 = select i1 %200, float undef, float %.temp292.9
|
||||
br i1 undef, label %IF2565, label %ELSE2566
|
||||
%tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%cmp0 = icmp eq i32 %tid0, 0
|
||||
br i1 %cmp0, label %IF2565, label %ELSE2566
|
||||
|
||||
IF2565: ; preds = %ENDIF
|
||||
br i1 false, label %ENDIF2582, label %ELSE2584
|
||||
%tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%cmp1 = icmp eq i32 %tid1, 0
|
||||
br i1 %cmp1, label %ENDIF2582, label %ELSE2584
|
||||
|
||||
ELSE2566: ; preds = %ENDIF
|
||||
%201 = fcmp oeq float %temp292.11, 1.000000e+04
|
||||
%tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%tidf = bitcast i32 %tid2 to float
|
||||
%201 = fcmp oeq float %temp292.11, %tidf
|
||||
br i1 %201, label %ENDLOOP, label %ELSE2593
|
||||
|
||||
ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
|
||||
|
@ -248,7 +255,9 @@ ENDIF2564: ; preds = %ENDIF2594, %ENDIF25
|
|||
%207 = fcmp ogt float undef, 0.000000e+00
|
||||
%208 = fcmp olt float undef, 1.000000e+00
|
||||
%209 = and i1 %207, %208
|
||||
%210 = fcmp olt float undef, %206
|
||||
%tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%tidf3 = bitcast i32 %tid3 to float
|
||||
%210 = fcmp olt float %tidf3, %206
|
||||
%211 = and i1 %209, %210
|
||||
br i1 %211, label %ENDIF2795, label %ELSE2797
|
||||
|
||||
|
@ -260,7 +269,9 @@ ENDIF2582: ; preds = %ELSE2584, %IF2565
|
|||
%213 = fadd float 0.000000e+00, %212
|
||||
%floor = call float @llvm.floor.f32(float %213)
|
||||
%214 = fsub float %213, %floor
|
||||
br i1 undef, label %IF2589, label %ELSE2590
|
||||
%tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
|
||||
%cmp4 = icmp eq i32 %tid4, 0
|
||||
br i1 %cmp4, label %IF2589, label %ELSE2590
|
||||
|
||||
IF2589: ; preds = %ENDIF2582
|
||||
br label %ENDIF2588
|
||||
|
@ -479,6 +490,8 @@ ELSE2824: ; preds = %ELSE2821
|
|||
br label %ENDIF2795
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.SI.load.const(<16 x i8>, i32) #2
|
||||
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s
|
||||
|
||||
; GCN-FUNC: {{^}}vccz_workaround:
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
|
||||
; GCN: v_cmp_neq_f32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0
|
||||
; GCN: s_and_b64 vcc, exec, [[MASK]]
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
; VCCZ-BUG: s_mov_b64 vcc, vcc
|
||||
; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
|
||||
; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: [[EXIT]]:
|
||||
; GCN: s_endpgm
|
||||
define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
|
||||
entry:
|
||||
%cnd = fcmp oeq float 0.0, %cond
|
||||
%sgpr = load volatile i32, i32 addrspace(2)* %in
|
||||
br i1 %cnd, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 %sgpr, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-FUNC: {{^}}vccz_noworkaround:
|
||||
; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
|
||||
; GCN: s_and_b64 vcc, exec, vcc
|
||||
; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: [[EXIT]]:
|
||||
; GCN: s_endpgm
|
||||
define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
|
||||
entry:
|
||||
%vgpr = load volatile float, float addrspace(1)* %in
|
||||
%cnd = fcmp oeq float 0.0, %vgpr
|
||||
br i1 %cnd, label %if, label %endif
|
||||
|
||||
if:
|
||||
store float %vgpr, float addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
|
@ -6,9 +6,11 @@ target triple="amdgcn--"
|
|||
; CHECK-LABEL: foobar:
|
||||
; CHECK: s_load_dword s2, s[0:1], 0x9
|
||||
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
|
||||
; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v1
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
||||
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
||||
; BB0_1:
|
||||
; CHECK: s_load_dword s6, s[0:1], 0xa
|
||||
|
@ -23,7 +25,9 @@ target triple="amdgcn--"
|
|||
define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
|
||||
entry:
|
||||
%v0 = insertelement <4 x float> undef, float %a0, i32 0
|
||||
br i1 undef, label %ift, label %ife
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
|
||||
%cnd = icmp eq i32 %tid, 0
|
||||
br i1 %cnd, label %ift, label %ife
|
||||
|
||||
ift:
|
||||
%v1 = insertelement <4 x float> undef, float %a1, i32 0
|
||||
|
@ -35,3 +39,7 @@ ife:
|
|||
store float %v2, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
|
|
@ -0,0 +1,365 @@
|
|||
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_scc:
|
||||
; SI-DAG: s_cmp_eq_i32 s{{[0-9]+}}, 0
|
||||
; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
|
||||
; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
|
||||
|
||||
; Fall-through to the else
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL]], 1
|
||||
|
||||
; SI: [[IF_LABEL]]:
|
||||
; SI: buffer_store_dword [[STORE_VAL]]
|
||||
define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%cmp0 = icmp eq i32 %cond, 0
|
||||
br i1 %cmp0, label %if, label %else
|
||||
|
||||
if:
|
||||
br label %done
|
||||
|
||||
else:
|
||||
br label %done
|
||||
|
||||
done:
|
||||
%value = phi i32 [0, %if], [1, %else]
|
||||
store i32 %value, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_vcc:
|
||||
; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
|
||||
; also scheduled the write first.
|
||||
; SI: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
|
||||
; SI: s_and_b64 vcc, exec, [[COND]]
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
|
||||
; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
|
||||
|
||||
; Fall-through to the else
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL]], 1
|
||||
|
||||
; SI: [[IF_LABEL]]:
|
||||
; SI: buffer_store_dword [[STORE_VAL]]
|
||||
define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%cmp0 = fcmp oeq float %cond, 0.0
|
||||
br i1 %cmp0, label %if, label %else
|
||||
|
||||
if:
|
||||
br label %done
|
||||
|
||||
else:
|
||||
br label %done
|
||||
|
||||
done:
|
||||
%value = phi i32 [0, %if], [1, %else]
|
||||
store i32 %value, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_swap_br_targets_scc:
|
||||
; SI-DAG: s_cmp_lg_i32 s{{[0-9]+}}, 0
|
||||
; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
|
||||
; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
|
||||
|
||||
; Fall-through to the else
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL]], 1
|
||||
|
||||
; SI: [[IF_LABEL]]:
|
||||
; SI: buffer_store_dword [[STORE_VAL]]
|
||||
define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%cmp0 = icmp eq i32 %cond, 0
|
||||
br i1 %cmp0, label %else, label %if
|
||||
|
||||
if:
|
||||
br label %done
|
||||
|
||||
else:
|
||||
br label %done
|
||||
|
||||
done:
|
||||
%value = phi i32 [0, %if], [1, %else]
|
||||
store i32 %value, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
|
||||
; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
|
||||
; also scheduled the write first.
|
||||
; SI: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
|
||||
; SI: s_and_b64 vcc, exec, [[COND]]
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
|
||||
; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
|
||||
|
||||
; Fall-through to the else
|
||||
; SI: v_mov_b32_e32 [[STORE_VAL]], 1
|
||||
|
||||
; SI: [[IF_LABEL]]:
|
||||
; SI: buffer_store_dword [[STORE_VAL]]
|
||||
define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%cmp0 = fcmp oeq float %cond, 0.0
|
||||
br i1 %cmp0, label %else, label %if
|
||||
|
||||
if:
|
||||
br label %done
|
||||
|
||||
else:
|
||||
br label %done
|
||||
|
||||
done:
|
||||
%value = phi i32 [0, %if], [1, %else]
|
||||
store i32 %value, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_move_valu:
|
||||
; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
|
||||
; Using a floating-point value in an integer compare will cause the compare to
|
||||
; be selected for the SALU and then later moved to the VALU.
|
||||
; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
|
||||
; SI: s_and_b64 vcc, exec, [[COND]]
|
||||
; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: buffer_store_dword
|
||||
; SI: [[ENDIF_LABEL]]:
|
||||
; SI: s_endpgm
|
||||
define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
|
||||
entry:
|
||||
%a.0 = fadd float %a, 10.0
|
||||
%cond = bitcast float %a.0 to i32
|
||||
%cmp = icmp eq i32 %cond, 5
|
||||
br i1 %cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 0, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_move_valu_commute:
|
||||
; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
|
||||
; Using a floating-point value in an integer compare will cause the compare to
|
||||
; be selected for the SALU and then later moved to the VALU.
|
||||
; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
|
||||
; SI: s_and_b64 vcc, exec, [[COND]]
|
||||
; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: buffer_store_dword
|
||||
; SI: [[ENDIF_LABEL]]:
|
||||
; SI: s_endpgm
|
||||
define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
|
||||
entry:
|
||||
%a.0 = fadd float %a, 10.0
|
||||
%cond = bitcast float %a.0 to i32
|
||||
%cmp = icmp ugt i32 %cond, 5
|
||||
br i1 %cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 0, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; SI-LABEL: {{^}}uniform_if_else:
|
||||
; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
|
||||
; SI: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; SI: buffer_store_dword [[ONE]]
|
||||
; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: [[ELSE_LABEL]]:
|
||||
; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
|
||||
; SI: buffer_store_dword [[TWO]]
|
||||
; SI: [[ENDIF_LABEL]]:
|
||||
; SI: s_endpgm
|
||||
define void @uniform_if_else(i32 addrspace(1)* nocapture %out, i32 %a) {
|
||||
entry:
|
||||
%cmp = icmp eq i32 %a, 0
|
||||
br i1 %cmp, label %if.then, label %if.else
|
||||
|
||||
if.then: ; preds = %entry
|
||||
store i32 1, i32 addrspace(1)* %out
|
||||
br label %if.end
|
||||
|
||||
if.else: ; preds = %entry
|
||||
store i32 2, i32 addrspace(1)* %out
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}icmp_2_users:
|
||||
; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1
|
||||
; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]]
|
||||
; SI: buffer_store_dword
|
||||
; SI: [[LABEL]]:
|
||||
; SI: s_endpgm
|
||||
define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
|
||||
main_body:
|
||||
%0 = icmp sgt i32 %cond, 0
|
||||
%1 = sext i1 %0 to i32
|
||||
br i1 %0, label %IF, label %ENDIF
|
||||
|
||||
IF:
|
||||
store i32 %1, i32 addrspace(1)* %out
|
||||
br label %ENDIF
|
||||
|
||||
ENDIF: ; preds = %IF, %main_body
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}icmp_users_different_blocks:
|
||||
; SI: s_load_dword [[COND:s[0-9]+]]
|
||||
; SI: s_cmp_lt_i32 [[COND]], 1
|
||||
; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
|
||||
; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]]
|
||||
; SI: s_and_b64 vcc, exec, [[MASK]]
|
||||
; SI: s_cbranch_vccnz [[EXIT]]
|
||||
; SI: buffer_store
|
||||
; SI: {{^}}[[EXIT]]:
|
||||
; SI: s_endpgm
|
||||
define void @icmp_users_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.r600.read.tidig.x() #0
|
||||
%tmp1 = icmp sgt i32 %cond, 0
|
||||
br i1 %tmp1, label %bb2, label %bb9
|
||||
|
||||
bb2: ; preds = %bb
|
||||
%tmp2 = sext i1 %tmp1 to i32
|
||||
%tmp3 = add i32 %tmp2, %tmp
|
||||
br i1 %tmp1, label %bb9, label %bb7
|
||||
|
||||
bb7: ; preds = %bb5
|
||||
store i32 %tmp3, i32 addrspace(1)* %out
|
||||
br label %bb9
|
||||
|
||||
bb9: ; preds = %bb8, %bb4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}uniform_loop:
|
||||
; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
|
||||
; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
|
||||
; get s_add_i32 here.
|
||||
; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
|
||||
; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
|
||||
; SI: s_and_b64 vcc, exec, vcc
|
||||
; SI: s_cbranch_vccnz [[LOOP_LABEL]]
|
||||
; SI: s_endpgm
|
||||
define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%i = phi i32 [0, %entry], [%i.i, %loop]
|
||||
%i.i = add i32 %i, 1
|
||||
%cmp = icmp eq i32 %a, %i.i
|
||||
br i1 %cmp, label %done, label %loop
|
||||
|
||||
done:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test uniform and divergent.
|
||||
|
||||
; SI-LABEL: {{^}}uniform_inside_divergent:
|
||||
; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
|
||||
; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
|
||||
; SI: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: s_cmp_lg_i32 {{s[0-9]+}}, 0
|
||||
; SI: s_cbranch_scc1 [[ENDIF_LABEL]]
|
||||
; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; SI: buffer_store_dword [[ONE]]
|
||||
define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
|
||||
entry:
|
||||
%tid = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%d_cmp = icmp ult i32 %tid, 16
|
||||
br i1 %d_cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 0, i32 addrspace(1)* %out
|
||||
%u_cmp = icmp eq i32 %cond, 0
|
||||
br i1 %u_cmp, label %if_uniform, label %endif
|
||||
|
||||
if_uniform:
|
||||
store i32 1, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}divergent_inside_uniform:
|
||||
; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
|
||||
; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
|
||||
; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
|
||||
; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
|
||||
; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; SI: buffer_store_dword [[ONE]]
|
||||
; SI: [[ENDIF_LABEL]]:
|
||||
; SI: s_endpgm
|
||||
define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
|
||||
entry:
|
||||
%u_cmp = icmp eq i32 %cond, 0
|
||||
br i1 %u_cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 0, i32 addrspace(1)* %out
|
||||
%tid = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%d_cmp = icmp ult i32 %tid, 16
|
||||
br i1 %d_cmp, label %if_uniform, label %endif
|
||||
|
||||
if_uniform:
|
||||
store i32 1, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI: {{^}}divergent_if_uniform_if:
|
||||
; SI: v_cmp_eq_i32_e32 vcc, 0, v0
|
||||
; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
|
||||
; SI: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
|
||||
; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; SI: buffer_store_dword [[ONE]]
|
||||
; SI: s_or_b64 exec, exec, [[MASK]]
|
||||
; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
|
||||
; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
|
||||
; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
|
||||
; SI: buffer_store_dword [[TWO]]
|
||||
; SI: [[EXIT]]:
|
||||
; SI: s_endpgm
|
||||
define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
|
||||
entry:
|
||||
%tid = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%d_cmp = icmp eq i32 %tid, 0
|
||||
br i1 %d_cmp, label %if, label %endif
|
||||
|
||||
if:
|
||||
store i32 1, i32 addrspace(1)* %out
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%u_cmp = icmp eq i32 %cond, 0
|
||||
br i1 %u_cmp, label %if_uniform, label %exit
|
||||
|
||||
if_uniform:
|
||||
store i32 2, i32 addrspace(1)* %out
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
|
||||
attributes #0 = { readnone }
|
|
@ -0,0 +1,56 @@
|
|||
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}icmp_2_users:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1
; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
; GCN: [[LABEL]]:
; GCN-NEXT: s_endpgm

; The compare result feeds both a sext and the branch.  A condition with
; two users must still be selected as a scalar compare + s_cbranch.
define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
main_body:
  %cmp = icmp sgt i32 %cond, 0
  %cmp.ext = sext i1 %cmp to i32
  br i1 %cmp, label %if.then, label %if.end

if.then:
  store i32 %cmp.ext, i32 addrspace(1)* %out
  br label %if.end

if.end:                                           ; preds = %if.then, %main_body
  ret void
}
|
||||
|
||||
; GCN-LABEL: {{^}}fix_sgpr_live_ranges_crash:
; GCN: s_cbranch_scc1 [[BB0:[A-Z0-9_]+]]
; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
; GCN: s_cbranch_scc1 [[LOOP]]
; GCN: {{^}}[[BB0]]:

; Reduced reproducer for a crash in SGPR live-range handling when a
; scalar branch jumps over a loop; only the branch structure is checked.
define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
bb:
  %cond = trunc i32 %arg to i1
  br i1 %cond, label %loop.preheader, label %tail

loop.preheader:                                   ; preds = %bb
  ; Result is intentionally unused; kept to reproduce the original crash.
  %unused.mul = mul i32 10, %arg1
  br label %loop

loop:                                             ; preds = %loop, %loop.preheader
  %exit.cmp = icmp eq i32 undef, %arg1
  br i1 %exit.cmp, label %tail, label %loop

tail:                                             ; preds = %loop, %bb
  %tid.y = tail call i32 @llvm.r600.read.tidig.y() #1
  %in.range = icmp ult i32 %tid.y, %arg
  br i1 %in.range, label %spin, label %exit

spin:                                             ; preds = %spin, %tail
  br i1 undef, label %spin, label %exit

exit:                                             ; preds = %spin, %tail
  ret void
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
attributes #1 = { nounwind readnone }
|
|
@ -7,9 +7,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|||
; moved using VALU instructions
|
||||
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
|
||||
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
|
||||
define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
|
||||
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
|
||||
entry:
|
||||
switch i32 %a, label %default [
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
switch i32 %tid, label %default [
|
||||
i32 0, label %case0
|
||||
i32 1, label %case1
|
||||
]
|
||||
|
@ -25,7 +26,7 @@ case1:
|
|||
br label %end
|
||||
|
||||
default:
|
||||
%cmp8 = icmp eq i32 %a, 2
|
||||
%cmp8 = icmp eq i32 %tid, 2
|
||||
%arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
|
||||
br i1 %cmp8, label %if, label %else
|
||||
|
||||
|
@ -80,9 +81,11 @@ exit:
|
|||
; SI: buffer_load_dword
|
||||
; SI-DAG: buffer_store_dword
|
||||
; SI-DAG: v_cmp_eq_i32_e32 vcc,
|
||||
; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
|
||||
; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
|
||||
; SI: s_cbranch_execnz BB2_3
|
||||
; SI-DAG: s_and_b64 vcc, exec, vcc
|
||||
; SI: s_cbranch_vccnz BB2_2
|
||||
; SI: s_branch BB2_3
|
||||
; SI: BB2_2:
|
||||
; SI: s_endpgm
|
||||
|
||||
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
|
||||
entry:
|
||||
|
|
|
@ -177,7 +177,8 @@ bb12: ; preds = %bb145, %bb
|
|||
%tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
|
||||
%tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
|
||||
%tmp142 = bitcast float %tmp95 to i32
|
||||
%tmp143 = icmp sgt i32 %tmp142, 125
|
||||
%tid = call i32 @llvm.r600.read.tidig.x() #1
|
||||
%tmp143 = icmp sgt i32 %tmp142, %tid
|
||||
br i1 %tmp143, label %bb144, label %bb145
|
||||
|
||||
bb144: ; preds = %bb12
|
||||
|
@ -583,5 +584,7 @@ bb145: ; preds = %bb12
|
|||
br label %bb12
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
|
|
@ -172,7 +172,8 @@ bb24: ; preds = %bb157, %bb
|
|||
%tmp152 = phi float [ 0.000000e+00, %bb ], [ %tmp417, %bb157 ]
|
||||
%tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ]
|
||||
%tmp154 = bitcast float %tmp107 to i32
|
||||
%tmp155 = icmp sgt i32 %tmp154, 125
|
||||
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
|
||||
%tmp155 = icmp sgt i32 %tmp154, %tid
|
||||
br i1 %tmp155, label %bb156, label %bb157
|
||||
|
||||
bb156: ; preds = %bb24
|
||||
|
@ -487,6 +488,8 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
|
|||
|
||||
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
|
||||
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
|
||||
|
||||
attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
||||
|
|
Loading…
Reference in New Issue