[AMDGPU] Promote constant offset to the immediate by finding a new base with 13bit constant offset from the nearby instructions.
Summary: Promote constant offset to immediate by recomputing the relative 13bit offset from nearby instructions.

E.g.
  s_movk_i32 s0, 0x1800
  v_add_co_u32_e32 v0, vcc, s0, v2
  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc

  s_movk_i32 s0, 0x1000
  v_add_co_u32_e32 v5, vcc, s0, v2
  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

  global_load_dwordx2 v[5:6], v[5:6], off
  global_load_dwordx2 v[0:1], v[0:1], off

=>
  s_movk_i32 s0, 0x1000
  v_add_co_u32_e32 v5, vcc, s0, v2
  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

  global_load_dwordx2 v[5:6], v[5:6], off
  global_load_dwordx2 v[0:1], v[5:6], off offset:2048

Author: FarhanaAleen

Reviewed By: arsenm, rampitec

Subscribers: llvm-commits, AMDGPU

Differential Revision: https://reviews.llvm.org/D55539

llvm-svn: 349196
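For the rewritten sequence above, the arithmetic is: 0x1800 - 0x1000 = 0x800 = 2048 bytes. On GFX9 the immediate offset field of global_load/global_store is a signed 13-bit value (range -4096..4095), so the 2048-byte displacement can be folded into the second load as offset:2048, and the s_movk_i32/v_add_co_u32/v_addc_co_u32 sequence that built the 0x1800 base is no longer needed.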
@@ -170,7 +170,6 @@ private:
  SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  bool isLegalFlatAddressingMode(const AddrMode &AM) const;
  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
  bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;

  unsigned isCFIntrinsic(const SDNode *Intr) const;

@@ -212,6 +211,7 @@ public:
                               SmallVectorImpl<Value*> &/*Ops*/,
                               Type *&/*AccessTy*/) const override;

  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                             unsigned AS,
                             Instruction *I = nullptr) const override;
@@ -20,6 +20,26 @@
 // ==>
 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
 // This pass also tries to promote constant offset to the immediate by
 // adjusting the base. It tries to use a base from the nearby instructions that
 // allows it to have a 13bit constant offset and then promotes the 13bit offset
 // to the immediate.
 // E.g.
 // s_movk_i32 s0, 0x1800
 // v_add_co_u32_e32 v0, vcc, s0, v2
 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
 //
 // s_movk_i32 s0, 0x1000
 // v_add_co_u32_e32 v5, vcc, s0, v2
 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 // global_load_dwordx2 v[5:6], v[5:6], off
 // global_load_dwordx2 v[0:1], v[0:1], off
 // =>
 // s_movk_i32 s0, 0x1000
 // v_add_co_u32_e32 v5, vcc, s0, v2
 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
 // global_load_dwordx2 v[5:6], v[5:6], off
 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
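The legality question the pass asks for each candidate pair reduces to: does the distance between the two constant offsets fit the hardware's immediate offset field? Below is a minimal, self-contained C++ sketch of that check. It is only an illustration: the real pass defers to SITargetLowering::isLegalGlobalAddressingMode, and the hard-coded [-4096, 4095] range is an assumption matching the GFX9 signed 13-bit offset.

  #include <cstdint>
  #include <cstdio>

  // Simplified stand-in for the signed 13-bit immediate offset range of GFX9
  // global memory instructions; the pass itself queries
  // SITargetLowering::isLegalGlobalAddressingMode instead.
  static bool fitsGlobalImmOffset(int64_t Dist) {
    return Dist >= -4096 && Dist <= 4095;
  }

  int main() {
    // Offsets from the example above: the two addresses differ by
    // 0x1800 - 0x1000 = 2048 bytes, so the second load can reuse the
    // first base and fold the difference into its immediate offset.
    int64_t OffsetA = 0x1800;
    int64_t OffsetB = 0x1000;
    int64_t Dist = OffsetA - OffsetB;
    printf("dist=%lld foldable=%d\n", (long long)Dist, fitsGlobalImmOffset(Dist));
    return 0;
  }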
@@ -116,6 +136,21 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
@@ -146,6 +181,19 @@ private:
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset);
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted);

public:
  static char ID;
@@ -1053,15 +1101,328 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  return Next;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), Reg)
      .addImm(Val);
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg =
    MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
      .addReg(CarryReg, RegState::Define)
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      .add(OffsetLo);
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
      .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
      .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
      .add(OffsetHi)
      .addReg(CarryReg, RegState::Kill);
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
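A small standalone illustration of the lo/hi split that computeBase performs on the 64-bit anchor offset before feeding it to the V_ADD_I32_e64/V_ADDC_U32_e64 pair. This is plain C++, independent of the pass; the offset value is just an example:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Split a 64-bit byte offset into the two 32-bit halves, as computeBase does.
    int64_t Offset = -4096;
    int32_t Lo = static_cast<int32_t>(Offset);       // low 32 bits: -4096
    int32_t Hi = static_cast<int32_t>(Offset >> 32); // sign extension: -1
    printf("lo=%d hi=%d\n", (int)Lo, (int)Hi);
    return 0;
  }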

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  assert(isInt<32>(*Offset0P) && isInt<32>(Src1->getImm())
         && "Expected 32bit immediate!!!");
  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
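Working the function's own example through: with %OFFSET0 = S_MOV_B32 8000 feeding the low add and the literal 0 on the high add, extractConstOffset yields Offset0P = 8000 and Offset1 = 0, so Addr.Offset = (8000 & 0xffffffff) | (0 << 32) = 8000, while the remaining operands become the lo/hi base registers recorded in Addr.Base.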

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) {

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
    return false;

  // TODO: Support Store.
  if (!MI.mayLoad())
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
               " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor(that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new-base(anchor) because of the maximum distance which can
  // accommodate more intermediate bases, presumably.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
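  // For this example: the distances from load1's offset (4096) to the other
  // offsets are -2048 (&a + 6144), -4096 (&a + 8192) and -6144 (&a + 10240).
  // Only the first two fit the signed 13-bit immediate range [-4096, 4095];
  // &a + 8192 has the larger magnitude, so it is picked as the anchor.
  // load5 keeps its own address because 12288 - 8192 = 4096 lies just
  // outside that range.
  //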
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
    static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor(with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)abs(Dist) > MaxDist) {
      MaxDist = abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Contains the list of instructions already visited, together with the
  // base registers and constant offset extracted from their addresses.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
@@ -0,0 +1,485 @@
|
|||
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
declare i64 @_Z13get_global_idj(i32)
|
||||
|
||||
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: clmem_read_simplified:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %a0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
|
||||
%addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
|
||||
%load1 = load i64, i64 addrspace(1)* %addr1, align 8
|
||||
%addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
|
||||
%load2 = load i64, i64 addrspace(1)* %addr2, align 8
|
||||
%add.1 = add i64 %load2, %load1
|
||||
|
||||
%add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
|
||||
%load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
|
||||
%add.2 = add i64 %load3, %add.1
|
||||
%add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
|
||||
%load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
|
||||
%add.3 = add i64 %load4, %add.2
|
||||
|
||||
%add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
|
||||
%load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
|
||||
%add.4 = add i64 %load5, %add.3
|
||||
%add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
|
||||
%load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
|
||||
%add.5 = add i64 %load6, %add.4
|
||||
|
||||
%add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
|
||||
%load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
|
||||
%add.6 = add i64 %load7, %add.5
|
||||
%add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
|
||||
%load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
|
||||
%add.7 = add i64 %load8, %add.6
|
||||
|
||||
store i64 %add.7, i64 addrspace(1)* %saddr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: clmem_read:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 17
|
||||
%idx.ext11 = and i64 %a0, 4261412864
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
%add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
|
||||
br label %for.cond.preheader
|
||||
|
||||
while.cond.loopexit: ; preds = %for.body
|
||||
%dec = add nsw i32 %dec31, -1
|
||||
%tobool = icmp eq i32 %dec31, 0
|
||||
br i1 %tobool, label %while.end, label %for.cond.preheader
|
||||
|
||||
for.cond.preheader: ; preds = %entry, %while.cond.loopexit
|
||||
%dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
|
||||
%sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %for.cond.preheader
|
||||
%block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
|
||||
%sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
|
||||
%conv3 = zext i32 %block.029 to i64
|
||||
%add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
|
||||
%load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
|
||||
%add = add i64 %load1, %sum.128
|
||||
|
||||
%add9 = or i32 %block.029, 256
|
||||
%conv3.1 = zext i32 %add9 to i64
|
||||
%add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
|
||||
%load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
|
||||
%add.1 = add i64 %load2, %add
|
||||
|
||||
%add9.1 = or i32 %block.029, 512
|
||||
%conv3.2 = zext i32 %add9.1 to i64
|
||||
%add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
|
||||
%l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
|
||||
%add.2 = add i64 %l3, %add.1
|
||||
|
||||
%add9.2 = or i32 %block.029, 768
|
||||
%conv3.3 = zext i32 %add9.2 to i64
|
||||
%add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
|
||||
%l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
|
||||
%add.3 = add i64 %l4, %add.2
|
||||
|
||||
%add9.3 = or i32 %block.029, 1024
|
||||
%conv3.4 = zext i32 %add9.3 to i64
|
||||
%add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
|
||||
%l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
|
||||
%add.4 = add i64 %l5, %add.3
|
||||
|
||||
%add9.4 = or i32 %block.029, 1280
|
||||
%conv3.5 = zext i32 %add9.4 to i64
|
||||
%add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
|
||||
%l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
|
||||
%add.5 = add i64 %l6, %add.4
|
||||
|
||||
%add9.5 = or i32 %block.029, 1536
|
||||
%conv3.6 = zext i32 %add9.5 to i64
|
||||
%add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
|
||||
%load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
|
||||
%add.6 = add i64 %load7, %add.5
|
||||
|
||||
%add9.6 = or i32 %block.029, 1792
|
||||
%conv3.7 = zext i32 %add9.6 to i64
|
||||
%add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
|
||||
%load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
|
||||
%add.7 = add i64 %load8, %add.6
|
||||
|
||||
%add9.7 = or i32 %block.029, 2048
|
||||
%conv3.8 = zext i32 %add9.7 to i64
|
||||
%add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
|
||||
%load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
|
||||
%add.8 = add i64 %load9, %add.7
|
||||
|
||||
%add9.8 = or i32 %block.029, 2304
|
||||
%conv3.9 = zext i32 %add9.8 to i64
|
||||
%add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
|
||||
%load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
|
||||
%add.9 = add i64 %load10, %add.8
|
||||
|
||||
%add9.9 = or i32 %block.029, 2560
|
||||
%conv3.10 = zext i32 %add9.9 to i64
|
||||
%add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
|
||||
%load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
|
||||
%add.10 = add i64 %load11, %add.9
|
||||
|
||||
%add9.31 = add nuw nsw i32 %block.029, 8192
|
||||
%cmp.31 = icmp ult i32 %add9.31, 4194304
|
||||
br i1 %cmp.31, label %for.body, label %while.cond.loopexit
|
||||
|
||||
while.end: ; preds = %while.cond.loopexit
|
||||
store i64 %add.10, i64 addrspace(1)* %a1, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; using 32bit address.
|
||||
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: Address32:
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%id = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %id, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
|
||||
|
||||
%add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
|
||||
%load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4
|
||||
|
||||
%add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
|
||||
%load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
|
||||
%add.1 = add i32 %load2, %load1
|
||||
|
||||
%add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
|
||||
%load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
|
||||
%add.2 = add i32 %load3, %add.1
|
||||
|
||||
%add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
|
||||
%load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
|
||||
%add.3 = add i32 %load4, %add.2
|
||||
|
||||
%add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
|
||||
%load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
|
||||
%add.4 = add i32 %load5, %add.3
|
||||
|
||||
%add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
|
||||
%load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
|
||||
%add.5 = add i32 %load6, %add.4
|
||||
|
||||
%add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
|
||||
%load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
|
||||
%add.6 = add i32 %load7, %add.5
|
||||
|
||||
%add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
|
||||
%load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
|
||||
%add.7 = add i32 %load8, %add.6
|
||||
|
||||
%add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
|
||||
%load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
|
||||
%add.8 = add i32 %load9, %add.7
|
||||
|
||||
%add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
|
||||
%load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
|
||||
%add.9 = add i32 %load10, %add.8
|
||||
|
||||
store i32 %add.9, i32 addrspace(1)* %addr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: Offset64:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %a0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
|
||||
%addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
|
||||
%load1 = load i64, i64 addrspace(1)* %addr1, align 8
|
||||
|
||||
%addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
|
||||
%load2 = load i64, i64 addrspace(1)* %addr2, align 8
|
||||
|
||||
%add1 = add i64 %load2, %load1
|
||||
|
||||
%addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
|
||||
%load3 = load i64, i64 addrspace(1)* %addr3, align 8
|
||||
|
||||
%add2 = add i64 %load3, %add1
|
||||
|
||||
%addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
|
||||
%load4 = load i64, i64 addrspace(1)* %addr4, align 8
|
||||
%add4 = add i64 %load4, %add2
|
||||
|
||||
store i64 %add4, i64 addrspace(1)* %saddr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; TODO: Support load4 as anchor instruction.
|
||||
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: p32Offset64:
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
|
||||
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %a0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
|
||||
|
||||
%addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
|
||||
%load1 = load i32, i32 addrspace(1)* %addr1, align 8
|
||||
|
||||
%addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
|
||||
%load2 = load i32, i32 addrspace(1)* %addr2, align 8
|
||||
|
||||
%add1 = add i32 %load2, %load1
|
||||
|
||||
%addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
|
||||
%load3 = load i32, i32 addrspace(1)* %addr3, align 8
|
||||
|
||||
%add2 = add i32 %load3, %add1
|
||||
|
||||
%addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
|
||||
%load4 = load i32, i32 addrspace(1)* %addr4, align 8
|
||||
%add4 = add i32 %load4, %add2
|
||||
|
||||
store i32 %add4, i32 addrspace(1)* %saddr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
|
||||
; GCN-LABEL: DiffBase:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
i8 addrspace(1)* %buffer2) {
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %a0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
|
||||
%saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
|
||||
%add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
|
||||
%saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*
|
||||
|
||||
%addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
|
||||
%load1 = load i64, i64 addrspace(1)* %addr1, align 8
|
||||
%add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
|
||||
%load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
|
||||
%add1 = add i64 %load2, %load1
|
||||
%add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
|
||||
%load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
|
||||
%add2 = add i64 %load3, %add1
|
||||
|
||||
%add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
|
||||
%load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
|
||||
|
||||
%add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
|
||||
%load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
|
||||
%add3 = add i64 %load5, %load4
|
||||
|
||||
%add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
|
||||
%load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
|
||||
%add4 = add i64 %load6, %add3
|
||||
|
||||
%add5 = add i64 %add2, %add4
|
||||
|
||||
store i64 %add5, i64 addrspace(1)* %saddr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
|
||||
; GCN-LABEL: ReverseOrder:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0)
|
||||
%conv = and i64 %call, 255
|
||||
%a0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %a0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
|
||||
%addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
|
||||
%load1 = load i64, i64 addrspace(1)* %addr1, align 8
|
||||
|
||||
%add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
|
||||
%load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
|
||||
%add7 = add i64 %load8, %load1
|
||||
|
||||
%add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
|
||||
%load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
|
||||
%add6 = add i64 %load7, %add7
|
||||
|
||||
%add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
|
||||
%load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
|
||||
%add5 = add i64 %load6, %add6
|
||||
|
||||
%add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
|
||||
%load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
|
||||
%add4 = add i64 %load5, %add5
|
||||
|
||||
%add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
|
||||
%load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
|
||||
%add3 = add i64 %load4, %add4
|
||||
|
||||
%add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
|
||||
%load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
|
||||
%add2 = add i64 %load3, %add3
|
||||
|
||||
%addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
|
||||
%load2 = load i64, i64 addrspace(1)* %addr2, align 8
|
||||
%add1 = add i64 %load2, %add2
|
||||
|
||||
store i64 %add1, i64 addrspace(1)* %saddr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
|
||||
; GCN-LABEL: negativeoffset:
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
|
||||
;
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
|
||||
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
|
||||
entry:
|
||||
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
|
||||
%conv = and i64 %call, 255
|
||||
%0 = shl i64 %call, 7
|
||||
%idx.ext11 = and i64 %0, 4294934528
|
||||
%add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
|
||||
%buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
|
||||
|
||||
%buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv
|
||||
|
||||
%addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
|
||||
%load1 = load i64, i64 addrspace(1)* %addr1, align 8
|
||||
|
||||
%addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
|
||||
%load2 = load i64, i64 addrspace(1)* %addr2, align 8
|
||||
|
||||
|
||||
%add = add i64 %load2, %load1
|
||||
|
||||
store i64 %add, i64 addrspace(1)* %buffer_head, align 8
|
||||
ret void
|
||||
}
|
|
@@ -0,0 +1,154 @@
|
|||
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
|
||||
# GFX9-LABEL: name: diffoporder_add
|
||||
# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0, 0
|
||||
# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0, 0
|
||||
|
||||
name: diffoporder_add
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:sgpr_64 = COPY $sgpr0_sgpr1
|
||||
%1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
|
||||
%3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
|
||||
%4:sreg_32_xm0 = COPY $sgpr101
|
||||
%5:sreg_32_xm0 = S_MOV_B32 0
|
||||
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
|
||||
$sgpr4 = COPY %4
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
%6:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
|
||||
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
|
||||
%10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
|
||||
%11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
|
||||
%12:sgpr_32 = COPY %1.sub1
|
||||
%13:vgpr_32 = COPY %5
|
||||
%14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
|
||||
%16:vgpr_32 = COPY %12
|
||||
%17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
|
||||
%19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
|
||||
%20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
|
||||
%21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
|
||||
%23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
|
||||
%25:sgpr_32 = S_MOV_B32 4096
|
||||
%26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %25, %21, implicit $exec
|
||||
%28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
|
||||
%30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
|
||||
%31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
|
||||
%32:sgpr_32 = S_MOV_B32 6144
|
||||
%33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
|
||||
%35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
|
||||
%37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
|
||||
%38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
|
||||
...
|
||||
---
|
||||
|
||||
# GFX9-LABEL: name: LowestInMiddle
|
||||
# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200
|
||||
# GFX9: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
|
||||
# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
|
||||
# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0, 0
|
||||
#
|
||||
# GFX9: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400
|
||||
# GFX9: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]]
|
||||
# GFX9: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_7]]
|
||||
# GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, 0,
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0,
|
||||
|
||||
name: LowestInMiddle
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:sgpr_64 = COPY $sgpr0_sgpr1
|
||||
%1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
|
||||
%3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
|
||||
%4:sreg_32_xm0 = COPY $sgpr101
|
||||
%5:sreg_32_xm0 = S_MOV_B32 0
|
||||
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
|
||||
$sgpr4 = COPY %4
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
%6:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
|
||||
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
|
||||
%10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
|
||||
%11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
|
||||
%12:sgpr_32 = COPY %1.sub1
|
||||
%13:vgpr_32 = COPY %5
|
||||
%14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
|
||||
%16:vgpr_32 = COPY %12
|
||||
%17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
|
||||
%19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
|
||||
%20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
|
||||
%21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
|
||||
%23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
|
||||
%25:sgpr_32 = S_MOV_B32 8000
|
||||
%26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
|
||||
%28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
|
||||
%30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
|
||||
%31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
|
||||
%32:sgpr_32 = S_MOV_B32 6400
|
||||
%33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
|
||||
%35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
|
||||
%37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
|
||||
%38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
|
||||
%39:sgpr_32 = S_MOV_B32 11200
|
||||
%40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
|
||||
%42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
|
||||
%44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
|
||||
%45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
|
||||
...
|
||||
---
|
||||
|
||||
# GFX9-LABEL: name: NegativeDistance
|
||||
# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240
|
||||
# GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]]
|
||||
# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]]
|
||||
# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, [[BASE_HI]], %subreg.sub1
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0, 0
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0
|
||||
# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0
|
||||
|
||||
name: NegativeDistance
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:sgpr_64 = COPY $sgpr0_sgpr1
|
||||
%1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0
|
||||
%3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
|
||||
%4:sreg_32_xm0 = COPY $sgpr101
|
||||
%5:sreg_32_xm0 = S_MOV_B32 0
|
||||
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3
|
||||
$sgpr4 = COPY %4
|
||||
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
|
||||
%6:vreg_64 = COPY $vgpr0_vgpr1
|
||||
%7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec
|
||||
%8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
%9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1
|
||||
%10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec
|
||||
%11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec
|
||||
%12:sgpr_32 = COPY %1.sub1
|
||||
%13:vgpr_32 = COPY %5
|
||||
%14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec
|
||||
%16:vgpr_32 = COPY %12
|
||||
%17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec
|
||||
%19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1
|
||||
%20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec
|
||||
%21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec
|
||||
%23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec
|
||||
%25:sgpr_32 = S_MOV_B32 6144
|
||||
%26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec
|
||||
%28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec
|
||||
%30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1
|
||||
%31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec
|
||||
%32:sgpr_32 = S_MOV_B32 8192
|
||||
%33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec
|
||||
%35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec
|
||||
%37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1
|
||||
%38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec
|
||||
%39:sgpr_32 = S_MOV_B32 10240
|
||||
%40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec
|
||||
%42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec
|
||||
%44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1
|
||||
%45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec
|
||||
...