[AMDGPU] Experiments show that the GCNRegBankReassign pass significantly impacts

the compilation time and there is no case for which we see any improvement in
performance. This patch removes this pass and its associated test cases from
the tree.

Differential Revision: https://reviews.llvm.org/D101313

Change-Id: I0599169a7609c19a887f8d847a71e664030cc141
This commit is contained in:
Baptiste Saleil 2021-04-26 15:48:12 -04:00
parent 84d16e2055
commit caf1294d95
78 changed files with 2160 additions and 3992 deletions

View File

@ -74,16 +74,6 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
namespace AMDGPU {
enum RegBankReassignMode {
RM_VGPR = 1,
RM_SGPR = 2,
RM_BOTH = RM_VGPR | RM_SGPR
};
}
MachineFunctionPass *
createGCNRegBankReassignPass(AMDGPU::RegBankReassignMode Mode);
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@ -342,9 +332,6 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
void initializeGCNRegBankReassignPass(PassRegistry &);
extern char &GCNRegBankReassignID;
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;

View File

@ -262,7 +262,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
}
@ -1177,10 +1176,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
if (EnableRegReassign) {
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
addPass(createGCNRegBankReassignPass(AMDGPU::RM_BOTH));
}
return true;
}

View File

@ -139,7 +139,6 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
GCNRegBankReassign.cpp
GCNNSAReassign.cpp
GCNDPPCombine.cpp
SIModeRegister.cpp

View File

@ -1,900 +0,0 @@
//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ to reduce register bank
/// conflicts.
///
/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
///
/// The shader can read one dword from each of these banks once per cycle.
/// If an instruction has to read more register operands from the same bank
/// an additional cycle is needed. HW attempts to pre-load registers through
/// input operand gathering, but a stall cycle may occur if that fails. For
/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
/// potentially incurring 2 stall cycles.
///
/// The pass tries to reassign registers to reduce bank conflicts.
///
/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
/// that 4 has to be subtracted from an SGPR bank number to get the real value.
/// This also corresponds to bit numbers in bank masks used in the pass.
///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
using namespace AMDGPU;
// Self-check level: 1 = recount stall cycles once after the pass,
// 2 = recount after every attempted reassignment (see runOnMachineFunction).
static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
  cl::desc("Verify stall cycles in the regbanks reassign pass"),
  cl::value_desc("0|1|2"),
  cl::init(0), cl::Hidden);

// Threshold to keep compile time reasonable.
static cl::opt<unsigned> VRegThresh("amdgpu-regbanks-reassign-threshold",
  cl::desc("Max number of vregs to run the regbanks reassign pass"),
  cl::init(15000), cl::Hidden);

#define DEBUG_TYPE "amdgpu-regbanks-reassign"

// Bank numbering used throughout: bits 0-3 are VGPR banks, bits 4-11 are
// SGPR banks (subtract SGPR_BANK_OFFSET to get the HW SGPR bank number).
#define NUM_VGPR_BANKS 4
#define NUM_SGPR_BANKS 8
#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
#define VGPR_BANK_MASK 0xf
#define SGPR_BANK_MASK 0xff0
#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)

STATISTIC(NumStallsDetected,
          "Number of operand read stalls detected");
STATISTIC(NumStallsRecovered,
          "Number of operand read stalls recovered");
namespace {
/// Post-RA pass that moves already-assigned VGPRs/SGPRs to other register
/// banks to reduce operand-read bank conflicts (see the file header for the
/// bank layout and numbering).
class GCNRegBankReassign : public MachineFunctionPass {

  // One explicit register use of an instruction together with the bank-mask
  // (NUM_BANKS bits) it occupies, as computed by getRegBankMask().
  class OperandMask {
  public:
    OperandMask(unsigned r, unsigned s, unsigned m)
        : Reg(r), SubReg(s), Mask(m) {}
    Register Reg;
    unsigned SubReg;
    unsigned Mask;
  };

  // A conflicting register use inside MI plus the set of banks (FreeBanks
  // bitmask) the register could be moved to.
  class Candidate {
  public:
    Candidate(MachineInstr *mi, Register reg, unsigned subreg,
              unsigned freebanks)
        : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const GCNRegBankReassign *P) const {
      MI->dump();
      dbgs() << P->printReg(Reg) << " to banks ";
      dumpFreeBanks(FreeBanks);
      dbgs() << '\n';
    }
#endif

    MachineInstr *MI;
    Register Reg;
    unsigned SubReg;
    unsigned FreeBanks;
  };

  // Candidates bucketed by weight. back()/pop_back() yield candidates from
  // the heaviest (most profitable) bucket first.
  class CandidateList : public std::map<unsigned, std::list<Candidate>> {
  public:
    // NOTE(review): `const Candidate&&` binds an rvalue but cannot be moved
    // from, so this always copies — presumably `Candidate&&` was intended.
    void push(unsigned Weight, const Candidate&& C) {
      operator[](Weight).push_front(C);
    }

    Candidate &back() {
      return rbegin()->second.back();
    }

    void pop_back() {
      rbegin()->second.pop_back();
      if (rbegin()->second.empty())
        erase(rbegin()->first);
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const GCNRegBankReassign *P) const {
      dbgs() << "\nCandidates:\n\n";
      for (auto &B : *this) {
        dbgs() << " Weight " << B.first << ":\n";
        for (auto &C : B.second)
          C.dump(P);
      }
      dbgs() << "\n\n";
    }
#endif
  };

public:
  static char ID;

public:
  GCNRegBankReassign(RegBankReassignMode Mode = RM_BOTH)
      : MachineFunctionPass(ID), Mode(Mode) {
    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN RegBank Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  MachineLoopInfo *MLI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  // Which register kinds (VGPR/SGPR/both) this pass instance may move.
  RegBankReassignMode Mode;

  unsigned MaxNumVGPRs;

  unsigned MaxNumSGPRs;

  // Scratch set of registers already accounted for by getRegBankMask()
  // within the current instruction; reset per instruction in analyzeInst().
  BitVector RegsUsed;

  // Per-operand bank masks of the instruction last analyzed.
  SmallVector<OperandMask, 8> OperandMasks;

  CandidateList Candidates;

  const MCPhysReg *CSRegs;

  // Returns bank for a phys reg.
  unsigned getPhysRegBank(Register Reg, unsigned SubReg) const;

  // Return a bit set for each register bank used. 4 banks for VGPRs and
  // 8 banks for SGPRs.
  // Registers already processed and recorded in RegsUsed are excluded.
  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
  uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank);

  // Analyze one instruction returning the number of stalls and a mask of the
  // banks used by all operands.
  // If Reg and Bank are provided, assume all uses of Reg will be replaced with
  // a register chosen from Bank.
  std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
                                            Register Reg = Register(),
                                            unsigned SubReg = 0, int Bank = -1);

  // Return true if register is regular VGPR or SGPR or their tuples.
  // Returns false for special registers like m0, vcc etc.
  bool isReassignable(Register Reg) const;

  // Check if registers' defs are old and may be pre-loaded.
  // Returns 0 if both registers are old enough, 1 or 2 if one or both
  // registers will not likely be pre-loaded.
  unsigned getOperandGatherWeight(const MachineInstr& MI,
                                  Register Reg1,
                                  Register Reg2,
                                  unsigned StallCycles) const;

  // Find all bank bits in UsedBanks where Mask can be relocated to.
  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;

  // Find all bank bits in UsedBanks where Mask can be relocated to.
  // Bank is relative to the register and not its subregister component.
  // Returns 0 if a register is not reassignable.
  unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask,
                        unsigned UsedBanks) const;

  // Add candidate instruction to the work list.
  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
                         unsigned StallCycles);

  // Collect candidate instructions across function. Returns the number of
  // stall cycles detected. Only counts stalls if Collect is false.
  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);

  // Remove all candidates that read specified register.
  void removeCandidates(Register Reg);

  // Compute stalls within the uses of SrcReg replaced by a register from
  // Bank. If Bank is -1 does not perform substitution. If Collect is set
  // candidates are collected and added to work list.
  unsigned computeStallCycles(Register SrcReg,
                              Register Reg = Register(),
                              unsigned SubReg = 0, int Bank = -1,
                              bool Collect = false);

  // Search for a register in Bank unused within LI.
  // Returns phys reg or NoRegister.
  MCRegister scavengeReg(LiveInterval &LI, unsigned Bank,
                         unsigned SubReg) const;

  // Try to reassign candidate. Returns number of stall cycles saved.
  unsigned tryReassign(Candidate &C);

  bool verifyCycles(MachineFunction &MF,
                    unsigned OriginalCycles, unsigned CyclesSaved);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
public:
  // Pretty-print a register; virtual registers also show their current
  // physical assignment (or "<unassigned>").
  Printable printReg(Register Reg, unsigned SubReg = 0) const {
    return Printable([Reg, SubReg, this](raw_ostream &OS) {
      if (Reg.isPhysical()) {
        OS << llvm::printReg(Reg, TRI);
        return;
      }
      if (!VRM->isAssignedReg(Reg))
        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
      else
        OS << llvm::printReg(Reg, TRI) << '('
           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
      if (SubReg)
        OS << ':' << TRI->getSubRegIndexName(SubReg);
    });
  }

  // Print a bank number relative to its register kind (SGPR banks rebased
  // to 0).
  static Printable printBank(unsigned Bank) {
    return Printable([Bank](raw_ostream &OS) {
      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
    });
  }

  static void dumpFreeBanks(unsigned FreeBanks) {
    for (unsigned L = 0; L < NUM_BANKS; ++L)
      if (FreeBanks & (1 << L))
        dbgs() << printBank(L) << ' ';
  }
#endif
};
} // End anonymous namespace.
// Register the pass and declare the analyses it depends on.
INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
                    false, false)

char GCNRegBankReassign::ID = 0;

char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
// Returns the bank (0-3 for VGPRs, 4-11 for SGPRs) occupied by the first
// 32-bit component of physical register Reg (or of Reg:SubReg when SubReg
// is given).
unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
                                            unsigned SubReg) const {
  assert(Reg.isPhysical());

  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  if (Size == 16)
    Reg = TRI->get32BitRegister(Reg);
  else if (Size > 32) {
    // For tuples, reduce to the first 32-bit component of the (sub)register.
    if (SubReg) {
      const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
      Reg = TRI->getSubReg(Reg, SubReg);
      if (TRI->getRegSizeInBits(*SubRC) > 32)
        Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
    } else {
      Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
    }
  }

  if (TRI->hasVGPRs(RC)) {
    unsigned RegNo = Reg - AMDGPU::VGPR0;
    return RegNo % NUM_VGPR_BANKS;
  }

  // SGPR banks hold register pairs, hence the /2 of the encoding value.
  unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
  return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
}
// Return a NUM_BANKS-wide bitmask of the banks occupied by Reg:SubReg,
// resolving virtual registers through the VirtRegMap. Components already
// recorded in RegsUsed are excluded so each physical register counts only
// once per instruction. If Bank != -1, pretend the register starts at that
// bank instead of its actual one.
uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
                                            int Bank) {
  if (Reg.isVirtual()) {
    if (!VRM->isAssignedReg(Reg))
      return 0;
    Reg = VRM->getPhys(Reg);
    if (!Reg)
      return 0;
    if (SubReg)
      Reg = TRI->getSubReg(Reg, SubReg);
  }

  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
  unsigned Size = TRI->getRegSizeInBits(*RC);

  // From here on, Size is the number of 32-bit components (dwords) and Reg
  // is the first 32-bit component of the tuple.
  if (Size == 16) {
    Reg = TRI->get32BitRegister(Reg);
    Size = 1;
  } else {
    Size /= 32;
    if (Size > 1)
      Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
  }

  if (TRI->hasVGPRs(RC)) {
    // VGPRs have 4 banks assigned in a round-robin fashion.
    unsigned RegNo = Reg - AMDGPU::VGPR0;
    uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
    unsigned Used = 0;
    // Bitmask lacks an extract method
    for (unsigned I = 0; I < Size; ++I)
      if (RegsUsed.test(RegNo + I))
        Used |= 1 << I;
    RegsUsed.set(RegNo, RegNo + Size);
    Mask &= ~Used;
    // Rotate the per-dword mask to the starting bank and fold the wraparound.
    Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank);
    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
  }

  // SGPRs have 8 banks holding 2 consecutive registers each.
  unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
  if (RegNo + StartBit >= RegsUsed.size())
    return 0;

  if (Size > 1)
    Size /= 2;
  unsigned Mask = (1 << Size) - 1;
  unsigned Used = 0;
  for (unsigned I = 0; I < Size; ++I)
    if (RegsUsed.test(StartBit + RegNo + I))
      Used |= 1 << I;
  RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size);
  Mask &= ~Used;
  Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS
                        : unsigned(Bank - SGPR_BANK_OFFSET);
  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
  // Reserve 4 bank ids for VGPRs.
  return Mask << SGPR_BANK_OFFSET;
}
// Count the stall cycles MI incurs from bank conflicts among its explicit
// register uses, and return them with the mask of banks those uses occupy.
// If Reg/Bank are given, model the effect of Reg (or Reg:SubReg) being
// reassigned into Bank. Side effect: fills OperandMasks with per-operand
// bank masks for collectCandidates().
std::pair<unsigned, unsigned>
GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg,
                                unsigned SubReg, int Bank) {
  unsigned StallCycles = 0;
  unsigned UsedBanks = 0;

  if (MI.isMetaInstruction())
    return std::make_pair(StallCycles, UsedBanks);

  // Skip scalar instructions entirely unless SGPR mode is enabled.
  if (!(Mode & RM_SGPR) &&
      MI.getDesc().TSFlags & (SIInstrFlags::SMRD | SIInstrFlags::SALU))
    return std::make_pair(StallCycles, UsedBanks);

  RegsUsed.reset();
  OperandMasks.clear();
  for (const auto& Op : MI.explicit_uses()) {
    // Undef can be assigned to any register, so two vregs can be assigned
    // the same phys reg within the same instruction.
    if (!Op.isReg() || Op.isUndef())
      continue;

    const Register R = Op.getReg();
    const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);

    // Do not compute stalls for AGPRs
    if (TRI->hasAGPRs(RC))
      continue;

    // Skip register kinds the current mode is not allowed to move.
    if ((Mode != RM_BOTH) && !(Mode & (TRI->hasVGPRs(RC) ? RM_VGPR : RM_SGPR)))
      continue;

    // Do not compute stalls if sub-register covers all banks
    if (Op.getSubReg()) {
      LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
      if (TRI->hasVGPRs(RC)) {
        if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
          continue;
      } else {
        if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
          continue;
      }
    }

    unsigned ShiftedBank = Bank;
    if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
      // Rotate the candidate bank by the channel distance between the
      // subregister being reassigned and the subregister used here.
      unsigned RegOffset =
          TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
      unsigned Offset = TRI->getChannelFromSubReg(
          Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
      if (Bank < NUM_VGPR_BANKS) {
        unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
        ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
      } else if (Bank >= SGPR_BANK_OFFSET) {
        // SGPR banks advance one per register pair, hence the >> 1.
        unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
        ShiftedBank = SGPR_BANK_OFFSET +
                      (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
      }
    }

    uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
                                   (Reg == R) ? ShiftedBank : -1);
    // Every bank read a second time costs one extra cycle.
    StallCycles += countPopulation(UsedBanks & Mask);
    UsedBanks |= Mask;
    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
  }

  return std::make_pair(StallCycles, UsedBanks);
}
// Walk backwards from MI for up to StallCycles instructions and check
// whether Reg1/Reg2 are defined so recently that the HW operand gather is
// unlikely to have pre-loaded them. Returns how many of the two registers
// (0, 1 or 2) have such a "young" def; used as a candidate weight.
unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
                                                    Register Reg1,
                                                    Register Reg2,
                                                    unsigned StallCycles) const
{
  unsigned Defs = 0;
  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
  // Defs == 3 means both registers were already found to be defined nearby.
  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
    --Def;
    // Skip instructions that do not produce a value. The previous code
    // tested `MI.isDebugInstr()` here, which is loop-invariant (it checks
    // the fixed outer instruction, never the iterated one), so debug
    // instructions were never actually skipped; test the walked
    // instruction instead.
    if (Def->isDebugInstr())
      continue;
    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
      continue;
    if (Def->modifiesRegister(Reg1, TRI))
      Defs |= 1;
    if (Def->modifiesRegister(Reg2, TRI))
      Defs |= 2;
  }
  return countPopulation(Defs);
}
// Return true if Reg is a regular, currently-assigned virtual VGPR/SGPR (or
// tuple) that this pass may move. Returns false for physical registers,
// split-origin registers, registers coupled to a physical register through
// copies, implicit uses, and special registers like m0 or vcc.
bool GCNRegBankReassign::isReassignable(Register Reg) const {
  if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
    return false;

  // InlineSpiller does not call LRM::assign() after an LI split leaving it
  // in an inconsistent state, so we cannot call LRM::unassign().
  // See llvm bug #48911.
  // Skip reassign if a register has originated from such split.
  // FIXME: Remove the workaround when bug #48911 is fixed.
  if (VRM->getPreSplitReg(Reg))
    return false;

  // Moving a register that merely mirrors a phys reg through a copy would
  // just reintroduce the conflict at the copy.
  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

  Register PhysReg = VRM->getPhys(Reg);

  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
    return false;

  // Take operands by reference: MachineOperand copies are needless here.
  for (const auto &U : MRI->use_nodbg_operands(Reg)) {
    if (U.isImplicit())
      return false;
    const MachineInstr *UseInst = U.getParent();
    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
      return false;
  }

  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
  unsigned Size = TRI->getRegSizeInBits(*RC);

  // TODO: Support 16 bit registers. Those needs to be moved with their
  //       parent VGPR_32 and potentially a sibling 16 bit sub-register.
  if (Size < 32)
    return false;

  if (TRI->hasVGPRs(RC))
    return true;

  // NOTE: a former `if (Size == 16)` check here was unreachable (Size < 32
  // already returned above) and has been removed.
  if (Size > 32)
    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);

  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
}
// Given Mask (the banks one register occupies) and UsedBanks (all banks the
// instruction reads), return a bitmask of alternative starting banks the
// register could move to without overlapping any other operand's banks.
unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
                                          unsigned UsedBanks) const {
  unsigned Size = countPopulation(Mask);
  unsigned FreeBanks = 0;
  unsigned Bank = findFirstSet(Mask);

  // The register's own banks do not conflict with itself.
  UsedBanks &= ~Mask;

  // Find free VGPR banks
  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
      if (Bank == I)
        continue;
      unsigned NewMask = ((1 << Size) - 1) << I;
      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
      if (!(UsedBanks & NewMask))
        FreeBanks |= 1 << I;
    }
    return FreeBanks;
  }

  // Find free SGPR banks
  // SGPR tuples must be aligned, so step is size in banks it
  // crosses.
  Bank -= SGPR_BANK_OFFSET;
  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
    if (Bank == I)
      continue;
    unsigned NewMask = ((1 << Size) - 1) << I;
    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
  }

  return FreeBanks;
}
// As above, but for a concrete use: the free-bank mask is rotated by the
// subregister's channel offset so the result is relative to the whole
// register rather than the conflicting component. Returns 0 when the
// register cannot be reassigned at all.
unsigned GCNRegBankReassign::getFreeBanks(Register Reg,
                                          unsigned SubReg,
                                          unsigned Mask,
                                          unsigned UsedBanks) const {
  if (!isReassignable(Reg))
    return 0;

  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);

  unsigned Offset = TRI->getChannelFromSubReg(SubReg);
  if (Offset && (Mask & VGPR_BANK_MASK)) {
    unsigned Shift = Offset;
    if (Shift >= NUM_VGPR_BANKS)
      return 0;
    // Rotate the 4 VGPR bank bits right by the channel offset.
    unsigned VB = FreeBanks & VGPR_BANK_MASK;
    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
                VGPR_BANK_MASK;
  } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
    // SGPR banks advance one per register pair, hence Offset >> 1.
    unsigned Shift = Offset >> 1;
    if (Shift >= NUM_SGPR_BANKS)
      return 0;
    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
                SGPR_BANK_SHIFTED_MASK;
    FreeBanks <<= SGPR_BANK_OFFSET;
  }

  LLVM_DEBUG(if (FreeBanks) {
          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
                 << " to banks: "; dumpFreeBanks(FreeBanks);
          dbgs() << '\n'; });

  return FreeBanks;
}
// For every pair of MI's operands whose bank masks overlap, weigh the
// conflict (operand gather weight plus a loop-depth bonus) and record each
// operand that has alternative banks available as a Candidate.
// Relies on OperandMasks filled by the preceding analyzeInst(MI) call.
void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
                                           unsigned UsedBanks,
                                           unsigned StallCycles) {
  LLVM_DEBUG(MI.dump());

  if (!StallCycles)
    return;

  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');

  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
    for (unsigned J = I + 1; J != E; ++J) {
      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
        continue;

      Register Reg1 = OperandMasks[I].Reg;
      Register Reg2 = OperandMasks[J].Reg;
      unsigned SubReg1 = OperandMasks[I].SubReg;
      unsigned SubReg2 = OperandMasks[J].SubReg;
      unsigned Mask1 = OperandMasks[I].Mask;
      unsigned Mask2 = OperandMasks[J].Mask;
      unsigned Size1 = countPopulation(Mask1);
      unsigned Size2 = countPopulation(Mask2);

      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
                      " and " << printReg(Reg2, SubReg2) << '\n');

      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
      // Conflicts inside loops are more profitable to resolve.
      Weight += MLI->getLoopDepth(MI.getParent()) * 10;

      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');

      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
      // Bump the weight of the smaller register of the pair: moving it is
      // preferred over moving the larger tuple.
      if (FreeBanks1)
        Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
                        Candidate(&MI, Reg1, SubReg1, FreeBanks1));
      if (FreeBanks2)
        Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
                        Candidate(&MI, Reg2, SubReg2, FreeBanks2));
    }
  }
}
// Visit every non-bundle, non-debug user of SrcReg exactly once and sum the
// stall cycles each incurs, optionally modelling Reg:SubReg as living in
// Bank (Bank == -1 means no substitution). When Collect is set, each
// analyzed user is also mined for reassignment candidates.
unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg,
                                                unsigned SubReg, int Bank,
                                                bool Collect) {
  unsigned TotalStallCycles = 0;
  SmallSet<const MachineInstr *, 16> Seen;

  for (auto &UseMI : MRI->use_nodbg_instructions(SrcReg)) {
    if (UseMI.isBundle())
      continue;
    // A register may appear several times in one instruction; count each
    // user only once.
    if (!Seen.insert(&UseMI).second)
      continue;
    const auto Analysis = analyzeInst(UseMI, Reg, SubReg, Bank);
    TotalStallCycles += Analysis.first;
    if (Collect)
      collectCandidates(UseMI, Analysis.second, Analysis.first);
  }

  return TotalStallCycles;
}
// Search Bank for a physical register of LI's class that LI does not
// interfere with. Stops at the occupancy-derived register limit, and bails
// out entirely rather than start using an as-yet-unused callee-saved
// register. Returns NoRegister on failure.
MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
                                           unsigned SubReg) const {
  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg());
  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
                                                : MaxNumSGPRs;
  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
                                                        : AMDGPU::SGPR0);

  for (MCRegister Reg : RC->getRegisters()) {
    // Check occupancy limit.
    if (TRI->isSubRegisterEq(Reg, MaxReg))
      break;

    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
      continue;

    // Do not touch a callee-saved register the function does not use yet.
    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return MCRegister::from(AMDGPU::NoRegister);

    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');

    if (!LRM->checkInterference(LI, Reg))
      return Reg;
  }

  return MCRegister::from(AMDGPU::NoRegister);
}
// Attempt to move candidate C.Reg into one of its free banks. Banks are
// ranked by modelled stall count and tried best-first; the first bank with
// a scavengeable register wins. Returns the number of stall cycles saved,
// or 0 (with the original assignment restored) if nothing profitable was
// found.
unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
  if (!LIS->hasInterval(C.Reg))
    return 0;

  LiveInterval &LI = LIS->getInterval(C.Reg);
  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
             LI.dump());

  // For each candidate bank walk all instructions in the range of live
  // interval and check if replacing the register with one belonging to
  // the candidate bank reduces conflicts.

  unsigned OrigStalls = computeStallCycles(C.Reg);
  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
  if (!OrigStalls)
    return 0;

  struct BankStall {
    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
    // Orders by descending stalls (ties by bank number) so that
    // pop_back_val() below yields the bank with the fewest stalls first.
    bool operator<(const BankStall &RHS) const {
      if (Stalls == RHS.Stalls)
        return Bank < RHS.Bank;
      return Stalls > RHS.Stalls;
    }
    unsigned Bank;
    unsigned Stalls;
  };
  SmallVector<BankStall, 8> BankStalls;

  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
    if (C.FreeBanks & (1 << Bank)) {
      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
      if (Stalls < OrigStalls) {
        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
                     << Stalls << '\n');
        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
      }
    }
  }
  llvm::sort(BankStalls);

  // Temporarily unassign LI so interference checks do not see LI itself.
  MCRegister OrigReg = VRM->getPhys(C.Reg);
  LRM->unassign(LI);
  while (!BankStalls.empty()) {
    BankStall BS = BankStalls.pop_back_val();
    MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg);
    if (Reg == AMDGPU::NoRegister) {
      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
                   << '\n');
      continue;
    }
    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
                 << " in bank " << printBank(BS.Bank) << '\n');

    LRM->assign(LI, Reg);

    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');

    return OrigStalls - BS.Stalls;
  }

  // Nothing worked: restore the original assignment.
  LRM->assign(LI, OrigReg);

  return 0;
}
// Walk every instruction of MF, summing its stall cycles. When Collect is
// true, also record reassignment candidates; when false this is a pure
// recount (used by verifyCycles()).
unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
                                               bool Collect) {
  unsigned TotalStallCycles = 0;

  for (MachineBasicBlock &MBB : MF) {

    LLVM_DEBUG(if (Collect) {
            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
            else dbgs() << MBB.getName(); dbgs() << ":\n";
          });

    for (MachineInstr &MI : MBB.instrs()) {
      if (MI.isBundle())
          continue; // we analyze the instructions inside the bundle individually

      unsigned StallCycles;
      unsigned UsedBanks;
      std::tie(StallCycles, UsedBanks) = analyzeInst(MI);

      if (Collect)
        collectCandidates(MI, UsedBanks, StallCycles);

      TotalStallCycles += StallCycles;
    }

    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
  }

  return TotalStallCycles;
}
// Drop every collected candidate whose instruction reads Reg, erasing any
// weight bucket that becomes empty in the process.
void GCNRegBankReassign::removeCandidates(Register Reg) {
  auto It = Candidates.begin();
  while (It != Candidates.end()) {
    It->second.remove_if([Reg, this](const Candidate &C) {
      return C.MI->readsRegister(Reg, TRI);
    });
    // map::erase returns the iterator following the erased bucket.
    It = It->second.empty() ? Candidates.erase(It) : std::next(It);
  }
}
// Sanity check: recount stall cycles and confirm the bookkeeping adds up,
// i.e. remaining cycles plus cycles saved equals the original count.
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
                                      unsigned OriginalCycles,
                                      unsigned CyclesSaved) {
  unsigned StallCycles = collectCandidates(MF, false);
  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
               << " stall cycles left\n");
  return StallCycles + CyclesSaved == OriginalCycles;
}
// Pass entry point: collect conflict candidates across the function, then
// greedily process them heaviest-first, invalidating and rescanning the
// affected register's uses after each successful move. Returns true iff
// any register assignment was changed.
bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // Only subtargets with register banking are relevant.
  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();

  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function "
                    << MF.getName() << '\n'
                    << ((Mode & RM_VGPR) ? "VGPR " : "")
                    << ((Mode & RM_SGPR) ? "SGPR " : "") << "mode\n"
                    << "NumVirtRegs = " << MRI->getNumVirtRegs() << "\n\n");

  // Bail out on huge functions to keep compile time reasonable.
  if (MRI->getNumVirtRegs() > VRegThresh) {
    LLVM_DEBUG(dbgs() << "NumVirtRegs > " << VRegThresh
                      << " threshold, skipping function.\n\n");
    return false;
  }

  TRI = ST->getRegisterInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned Occupancy = MFI->getOccupancy();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
  // Clamp to the current occupancy so reassignment never raises pressure.
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);

  CSRegs = MRI->getCalleeSavedRegs();

  unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
                         // Not a tight bound
                         AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
  RegsUsed.resize(NumRegBanks);

  unsigned StallCycles = collectCandidates(MF);
  NumStallsDetected += StallCycles;

  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
                       "function " << MF.getName() << '\n');

  LLVM_DEBUG(Candidates.dump(this));

  unsigned CyclesSaved = 0;
  while (!Candidates.empty()) {
    Candidate C = Candidates.back();
    unsigned LocalCyclesSaved = tryReassign(C);
    CyclesSaved += LocalCyclesSaved;

    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
      report_fatal_error("RegBank reassign stall cycles verification failed.");

    Candidates.pop_back();
    if (LocalCyclesSaved) {
      // The move may have changed conflicts elsewhere: drop candidates
      // reading C.Reg and rescan its users to collect fresh ones.
      removeCandidates(C.Reg);
      computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
      LLVM_DEBUG(Candidates.dump(this));
    }
  }
  NumStallsRecovered += CyclesSaved;

  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
                    << " cycles saved in function " << MF.getName() << '\n');

  Candidates.clear();
  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
    report_fatal_error("RegBank reassign stall cycles verification failed.");

  RegsUsed.clear();

  return CyclesSaved > 0;
}
// Factory used by the target pass pipeline setup.
MachineFunctionPass *
llvm::createGCNRegBankReassignPass(RegBankReassignMode Mode) {
  return new GCNRegBankReassign(Mode);
}

View File

@ -1643,12 +1643,8 @@ define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[10:11]
; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[7:8]
; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = ashr <2 x i64> %value, %amount
ret <2 x i64> %result

View File

@ -314,45 +314,45 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
; GFX10-NEXT: v_cndmask_b32_e64 v23, v4, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v5, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v27, v18, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v7, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v8, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo
; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48
; GFX10-NEXT: v_cndmask_b32_e32 v5, v27, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v9, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v10, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v11, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v12, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
; GFX10-NEXT: v_cndmask_b32_e64 v6, v3, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v14, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v16, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo
@ -577,54 +577,54 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(
;
; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e32 v4, s8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, s9, v3, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, s8, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, v3, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 4, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s14
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s14
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s18, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s19, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s20, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s20, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s21, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s23, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s22, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s22, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s23, s0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s2, v2

View File

@ -581,9 +581,9 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2

View File

@ -223,9 +223,9 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: v_and_b32_sdwa v4, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v6, v0, s4, v1
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX10-NEXT: v_and_b32_e32 v1, 3, v2
; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -1036,12 +1036,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3
; GFX10-NEXT: s_lshr_b32 s0, s2, 2
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5
; GFX10-NEXT: s_and_b32 s0, s2, 3
; GFX10-NEXT: s_lshl_b32 s0, s0, 3
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@ -2613,25 +2613,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX10-NEXT: v_and_b32_sdwa v13, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_and_or_b32 v23, v1, s1, v8
; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v8
; GFX10-NEXT: s_lshr_b32 s0, s2, 2
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v17, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v19
; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v13, v7
; GFX10-NEXT: v_or3_b32 v1, v23, v14, v9
; GFX10-NEXT: v_or3_b32 v1, v1, v14, v9
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
; GFX10-NEXT: v_and_or_b32 v5, v3, v4, v5
; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_or3_b32 v2, v2, v17, v11
; GFX10-NEXT: v_or3_b32 v2, v2, v15, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v6
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2

View File

@ -186,9 +186,9 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
@ -227,9 +227,9 @@ define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel)
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
@ -346,20 +346,20 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: s_mov_b64 s[4:5], 1
; GFX10-NEXT: s_mov_b64 s[8:9], 3
; GFX10-NEXT: s_mov_b64 s[14:15], 4
; GFX10-NEXT: s_mov_b64 s[10:11], 4
; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, s5, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
; GFX10-NEXT: s_mov_b64 s[12:13], 5
; GFX10-NEXT: s_mov_b64 s[14:15], 6
; GFX10-NEXT: s_mov_b64 s[16:17], 7
; GFX10-NEXT: s_mov_b64 s[18:19], 8
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX10-NEXT: s_mov_b64 s[14:15], 6
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
@ -561,11 +561,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s46, s12
; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: s_mov_b32 s47, s13
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: s_mov_b32 s14, s16
@ -576,8 +576,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
@ -624,23 +624,23 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@ -860,9 +860,9 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
@ -1360,23 +1360,23 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_nc_u32_e32 v16, 3, v16
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@ -1416,9 +1416,9 @@ define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
@ -1530,23 +1530,23 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
@ -2001,9 +2001,9 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -2034,9 +2034,9 @@ define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel)
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: ; return to shader part epilog
@ -2162,9 +2162,9 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7
@ -2199,9 +2199,9 @@ define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel)
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
@ -2311,19 +2311,19 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s14, s12
; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: s_mov_b32 s47, s13
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s47, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s11, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@ -2358,17 +2358,17 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
@ -2520,11 +2520,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: s_mov_b32 s8, s10
; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s46, s12
; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: s_mov_b32 s47, s13
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
@ -2533,8 +2533,8 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo
@ -2575,23 +2575,23 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v14
; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v14
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v14
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <7 x double> %vec, i32 %sel
@ -3168,8 +3168,8 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; GFX10-NEXT: s_mov_b32 s9, s11
; GFX10-NEXT: s_mov_b32 s46, s12
; GFX10-NEXT: s_mov_b32 s47, s13
; GFX10-NEXT: s_mov_b32 s10, s12
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
@ -3187,9 +3187,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s47, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
@ -3245,25 +3245,25 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
@ -3476,25 +3476,25 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15

View File

@ -637,9 +637,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
@ -849,9 +849,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
@ -1515,9 +1515,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1

View File

@ -712,27 +712,27 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-LABEL: v_fdiv_v2f32:
@ -752,18 +752,18 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7
; GFX10-FLUSH-NEXT: v_div_scale_f32 v11, vcc_lo, v1, v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
; GFX10-FLUSH-NEXT: s_denorm_mode 3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v11, v6
; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v11
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6
; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v11, -v4, v5
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v11, v6, v5
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> %a, %b
@ -874,27 +874,27 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
@ -905,16 +905,16 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> %a, %b, !fpmath !0
ret <2 x float> %fdiv
@ -1044,25 +1044,25 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -1226,25 +1226,25 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0
; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -1465,27 +1465,27 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1
; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5
; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0
; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
@ -1496,16 +1496,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
ret <2 x float> %fdiv

View File

@ -105,10 +105,10 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11]
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b
ret double %fdiv
@ -355,9 +355,9 @@ define double @v_rcp_f64_arcp_afn(double %x) {
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[2:3]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
; GFX10-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn double 1.0, %x
ret double %fdiv
@ -458,10 +458,10 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11]
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn double %a, %b, !fpmath !0
ret double %fdiv
@ -634,33 +634,29 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v30, v4
; GFX10-NEXT: v_mov_b32_e32 v31, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x double> %a, %b
ret <2 x double> %fdiv
@ -692,30 +688,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v18, v4
; GFX10-NEXT: v_mov_b32_e32 v19, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b
ret <2 x double> %fdiv
@ -816,33 +804,29 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v30, v4
; GFX10-NEXT: v_mov_b32_e32 v31, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv
@ -943,29 +927,29 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x
ret <2 x double> %fdiv
@ -1066,29 +1050,29 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x double> <double 1.0, double 1.0>, %x
ret <2 x double> %fdiv
@ -1120,26 +1104,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[14:15]
; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[0:1]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[2:3]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5]
; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7]
; GFX10-NEXT: v_fma_f64 v[14:15], -v[14:15], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[0:1], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
ret <2 x double> %fdiv
@ -1240,29 +1220,29 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7]
; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9]
; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x, !fpmath !0
ret <2 x double> %fdiv
@ -1294,30 +1274,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v18, v4
; GFX10-NEXT: v_mov_b32_e32 v19, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv
@ -1418,33 +1390,29 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v30, v4
; GFX10-NEXT: v_mov_b32_e32 v31, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7]
; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9]
; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0
; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13]
; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13]
; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17]
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19]
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3]
; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv
@ -1476,30 +1444,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v18, v4
; GFX10-NEXT: v_mov_b32_e32 v19, v5
; GFX10-NEXT: v_mov_b32_e32 v4, v6
; GFX10-NEXT: v_mov_b32_e32 v5, v7
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11]
; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23]
; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15]
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11]
; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9]
; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
ret <2 x double> %fdiv

View File

@ -479,12 +479,8 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v12, v2
; GFX10-NEXT: v_mov_b32_e32 v13, v3
; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z)
ret <2 x double> %fma

View File

@ -1159,7 +1159,6 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_and_b32_e32 v11, 7, v2
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
@ -1167,13 +1166,14 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8
; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v15, 0xff
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-NEXT: v_mov_b32_e32 v13, 0xff
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_and_b32_e32 v12, s4, v1
; GFX10-NEXT: v_and_b32_e32 v6, s4, v6
; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2
; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
@ -2190,13 +2190,13 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
; GFX10-NEXT: s_sub_i32 s4, 0, 24
; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
; GFX10-NEXT: v_and_b32_e32 v2, v2, v12
; GFX10-NEXT: v_and_b32_e32 v3, v3, v12
; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
; GFX10-NEXT: v_and_b32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v3, v3, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
@ -2224,19 +2224,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v15
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v11, v6, v12
; GFX10-NEXT: v_and_b32_e32 v4, v4, v10
; GFX10-NEXT: v_and_b32_e32 v6, v6, v10
; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5
; GFX10-NEXT: v_and_b32_e32 v10, v5, v12
; GFX10-NEXT: v_lshrrev_b32_e32 v2, v11, v2
; GFX10-NEXT: v_and_b32_e32 v6, v7, v12
; GFX10-NEXT: v_and_b32_e32 v7, v15, v12
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX10-NEXT: v_lshl_or_b32 v0, v0, v7, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, v10, v3
; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v7, v7, v10
; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
ret <2 x i24> %result
@ -2617,13 +2617,13 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX10-NEXT: v_alignbit_b32 v0, v7, v2, v4
; GFX10-NEXT: v_alignbit_b32 v1, v6, v3, v5
; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
ret <2 x i32> %result
@ -2770,22 +2770,22 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_alignbit_b32 v22, v1, v5, 1
; GFX10-NEXT: v_alignbit_b32 v18, v0, v4, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 1, v0
; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1
; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1
; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1
; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v19, 1, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9
; GFX10-NEXT: v_alignbit_b32 v5, v2, v6, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v23, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10
; GFX10-NEXT: v_alignbit_b32 v13, v3, v7, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 1, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11
; GFX10-NEXT: v_alignbit_b32 v0, v15, v18, v8
; GFX10-NEXT: v_alignbit_b32 v1, v19, v22, v9
; GFX10-NEXT: v_alignbit_b32 v2, v23, v5, v10
; GFX10-NEXT: v_alignbit_b32 v3, v14, v13, v11
; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
ret <4 x i32> %result
@ -4176,15 +4176,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5
; GFX10-NEXT: s_mov_b32 s4, 0xf000f
; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_and_b32_e32 v11, s4, v4
; GFX10-NEXT: v_and_b32_e32 v15, s4, v6
; GFX10-NEXT: v_and_b32_e32 v19, s4, v5
; GFX10-NEXT: v_and_b32_e32 v6, s4, v7
; GFX10-NEXT: v_and_b32_e32 v4, s4, v4
; GFX10-NEXT: v_and_b32_e32 v6, s4, v6
; GFX10-NEXT: v_and_b32_e32 v5, s4, v5
; GFX10-NEXT: v_and_b32_e32 v7, s4, v7
; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v11, v0
; GFX10-NEXT: v_pk_lshrrev_b16 v2, v15, v2
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v19, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0
; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -4290,9 +4290,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_and_b32_e32 v7, 63, v4
; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
@ -4703,18 +4703,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX10-NEXT: v_and_b32_e32 v15, 63, v8
; GFX10-NEXT: v_and_b32_e32 v19, 63, v9
; GFX10-NEXT: v_and_b32_e32 v13, 63, v11
; GFX10-NEXT: v_and_b32_e32 v9, 63, v10
; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
; GFX10-NEXT: v_or_b32_e32 v1, v12, v5
; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
; GFX10-NEXT: v_or_b32_e32 v3, v16, v7
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
ret <2 x i64> %result
@ -5178,16 +5178,14 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8
; GFX10-NEXT: s_movk_i32 s4, 0x7f
; GFX10-NEXT: v_mov_b32_e32 v27, v2
; GFX10-NEXT: v_and_b32_e32 v18, s4, v8
; GFX10-NEXT: v_mov_b32_e32 v28, v3
; GFX10-NEXT: v_and_b32_e32 v19, s4, v9
; GFX10-NEXT: s_sub_i32 s4, 64, 1
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[6:7]
; GFX10-NEXT: s_sub_i32 s4, 1, 64
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_lshrrev_b64 v[15:16], s4, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[12:13], s4, v[6:7]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_or_b32_e32 v8, v8, v10
@ -5197,48 +5195,48 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_sub_nc_u32_e32 v14, 64, v18
; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v9, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_lshrrev_b64 v[8:9], v14, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[27:28]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3]
; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v7, s4
; GFX10-NEXT: v_subrev_nc_u32_e32 v23, 64, v18
; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v7, s4
; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18
; GFX10-NEXT: v_or_b32_e32 v10, v8, v10
; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19
; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[21:22]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[21:22]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v14, v14, v16
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19
; GFX10-NEXT: v_or_b32_e32 v15, v15, v17
; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[21:22]
; GFX10-NEXT: v_cndmask_b32_e64 v23, v8, v14, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4
; GFX10-NEXT: v_cndmask_b32_e32 v31, 0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v27, s6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v28, s6
; GFX10-NEXT: v_or_b32_e32 v0, v31, v4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6
; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@ -5473,7 +5471,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[8:9]
; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[8:9]
@ -5481,25 +5479,25 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-NEXT: v_or_b32_e32 v6, v6, v8
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13
; GFX10-NEXT: v_or_b32_e32 v7, v7, v9
; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[8:9]
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s2, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s6, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1
; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX10-NEXT: v_or_b32_e32 v0, v11, v0
; GFX10-NEXT: v_or_b32_e32 v1, v15, v1
; GFX10-NEXT: v_or_b32_e32 v2, v19, v2
; GFX10-NEXT: v_or_b32_e32 v3, v6, v3
; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@ -5756,7 +5754,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
; GFX10-NEXT: s_sub_i32 s0, 1, 64
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
@ -5765,12 +5763,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX10-NEXT: s_sub_i32 s0, 64, s4
@ -5778,7 +5776,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
; GFX10-NEXT: s_sub_i32 s0, s4, 64
; GFX10-NEXT: s_cmp_lt_u32 s4, 64
; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
@ -5787,12 +5785,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX10-NEXT: v_or_b32_e32 v0, s8, v0
@ -6025,7 +6023,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1]
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[11:12], s5, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s12, 1, s6
; GFX10-NEXT: s_sub_i32 s13, 1, 64
@ -6045,10 +6043,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s14, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12
@ -6419,7 +6417,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_movk_i32 s4, 0x41
; GFX10-NEXT: v_lshrrev_b32_e32 v19, 31, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 31, v5
; GFX10-NEXT: s_sub_i32 s5, 64, s4
; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1]
@ -6431,39 +6429,39 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_sub_i32 s5, 64, 63
; GFX10-NEXT: v_or_b32_e32 v15, v9, v11
; GFX10-NEXT: v_or_b32_e32 v14, v8, v10
; GFX10-NEXT: v_or_b32_e32 v15, v9, v11
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[6:7]
; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s7, 1, s4
; GFX10-NEXT: s_sub_i32 s4, 63, 64
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
; GFX10-NEXT: v_lshrrev_b64 v[23:24], s4, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[10:11], s4, v[6:7]
; GFX10-NEXT: s_cmp_lt_u32 63, 64
; GFX10-NEXT: v_or_b32_e32 v6, v19, v8
; GFX10-NEXT: v_or_b32_e32 v6, v16, v8
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 63, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s7
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v23, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s4
; GFX10-NEXT: s_and_b32 s5, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v8, v24, v9, s4
; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s4
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
; GFX10-NEXT: v_cndmask_b32_e64 v19, v0, v2, s6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, v2, s6
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v5, s5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v3, s6
; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
; GFX10-NEXT: v_or_b32_e32 v1, v27, v5
; GFX10-NEXT: v_or_b32_e32 v2, v19, v6
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s6
; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
; GFX10-NEXT: v_or_b32_e32 v1, v13, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
ret i128 %result
@ -6810,21 +6808,19 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10: ; %bb.0:
; GFX10-NEXT: s_movk_i32 s18, 0x7f
; GFX10-NEXT: s_mov_b32 s19, 0
; GFX10-NEXT: s_mov_b32 s30, s0
; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX10-NEXT: s_sub_i32 s17, s22, 64
; GFX10-NEXT: s_sub_i32 s23, 64, s22
; GFX10-NEXT: s_cmp_lt_u32 s22, 64
; GFX10-NEXT: s_mov_b32 s31, s1
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s22, 0
; GFX10-NEXT: s_cselect_b32 s29, 1, 0
; GFX10-NEXT: s_lshr_b64 s[24:25], s[30:31], s23
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23
; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22
; GFX10-NEXT: s_lshl_b64 s[22:23], s[30:31], s22
; GFX10-NEXT: s_lshl_b64 s[22:23], s[0:1], s22
; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
; GFX10-NEXT: s_lshl_b64 s[0:1], s[30:31], s17
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
@ -6844,7 +6840,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_cmp_lg_u32 s30, 0
; GFX10-NEXT: s_cselect_b64 s[46:47], s[8:9], s[0:1]
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[26:27], 0
; GFX10-NEXT: s_sub_i32 s26, s16, 64
@ -6853,7 +6849,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_cselect_b32 s27, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s30, 1, 0
; GFX10-NEXT: s_lshr_b64 s[10:11], s[46:47], s16
; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17
; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25]
@ -6861,7 +6857,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s30, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[46:47], s[8:9]
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21]
@ -7329,8 +7325,6 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_mov_b32_e32 v29, v2
; GFX10-NEXT: v_mov_b32_e32 v30, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v23, 64, v27
; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v16, vcc_lo
@ -7338,20 +7332,20 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[29:30]
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3]
; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28
; GFX10-NEXT: v_cndmask_b32_e32 v34, v21, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v35, v22, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v8, v21, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v10, s4
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v11, s4
; GFX10-NEXT: v_or_b32_e32 v18, v16, v18
; GFX10-NEXT: v_subrev_nc_u32_e32 v31, 64, v27
; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28
; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[34:35]
; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v19, v17, v19
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v31, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
; GFX10-NEXT: v_or_b32_e32 v23, v23, v25
@ -7363,77 +7357,77 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4
; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v34, s5
; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v35, s5
; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s5
; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s5
; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, v1, s4
; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20
; GFX10-NEXT: v_or_b32_e32 v0, v21, v8
; GFX10-NEXT: v_or_b32_e32 v1, v11, v9
; GFX10-NEXT: v_or_b32_e32 v0, v21, v8
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], s8, v[14:15]
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27
; GFX10-NEXT: v_and_b32_e32 v27, s7, v16
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_and_b32_e32 v25, s7, v16
; GFX10-NEXT: v_and_b32_e32 v24, s7, v20
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_lshrrev_b64 v[16:17], s9, v[14:15]
; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
; GFX10-NEXT: v_or_b32_e32 v10, v8, v10
; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_and_b32_e32 v24, s7, v20
; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v30, s6
; GFX10-NEXT: v_cndmask_b32_e32 v19, v16, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v31, v17, v11, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v29, s6
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24
; GFX10-NEXT: v_lshrrev_b64 v[10:11], v18, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[14:15], v24, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25
; GFX10-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v9, s4
; GFX10-NEXT: v_sub_nc_u32_e32 v31, 64, v27
; GFX10-NEXT: v_lshrrev_b64 v[35:36], v18, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v2, v2, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v27
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[18:19], v27, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[20:21], v31, v[8:9]
; GFX10-NEXT: v_or_b32_e32 v14, v10, v14
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25
; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[8:9]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24
; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v5, v36, v15
; GFX10-NEXT: v_or_b32_e32 v14, v35, v14
; GFX10-NEXT: v_or_b32_e32 v5, v11, v15
; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9]
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25
; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v16, v18, v20
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v27
; GFX10-NEXT: v_or_b32_e32 v18, v19, v21
; GFX10-NEXT: v_cndmask_b32_e32 v31, v3, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v3, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[3:4], v27, v[8:9]
; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[8:9]
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v24
; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v18, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4
; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v17, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v14, v5, v7, s6
; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v4, s4
; GFX10-NEXT: v_cndmask_b32_e64 v19, v31, v6, s6
; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v12, s5
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v13, s5
; GFX10-NEXT: v_or_b32_e32 v3, v22, v23
; GFX10-NEXT: v_or_b32_e32 v7, v14, v11
; GFX10-NEXT: v_or_b32_e32 v7, v7, v11
; GFX10-NEXT: v_or_b32_e32 v4, v15, v5
; GFX10-NEXT: v_or_b32_e32 v6, v19, v10
; GFX10-NEXT: v_or_b32_e32 v6, v6, v10
; GFX10-NEXT: v_or_b32_e32 v5, v9, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)

View File

@ -260,9 +260,9 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f
; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2
; GFX10-NEXT: v_and_b32_e32 v2, v2, v3
; GFX10-NEXT: v_and_b32_e32 v7, v4, v3
; GFX10-NEXT: v_and_b32_e32 v4, v4, v3
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
@ -1158,38 +1158,38 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6
; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-NEXT: v_and_b32_e32 v15, 7, v8
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v14, 7, v11
; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_mov_b32_e32 v15, 0xff
; GFX10-NEXT: v_lshlrev_b16 v3, v14, v3
; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12
; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v13, 0xff
; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1
; GFX10-NEXT: v_and_b32_e32 v8, s4, v1
; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v10, 7, v10
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
; GFX10-NEXT: v_and_b32_e32 v7, s4, v7
; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v15, 7, v14
; GFX10-NEXT: v_and_b32_e32 v13, 7, v14
; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v12, 7, v12
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4
; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v5, v15, v5
; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5
; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8
; GFX10-NEXT: v_or_b32_e32 v3, v3, v6
@ -2190,14 +2190,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
; GFX10-NEXT: s_sub_i32 s4, 0, 24
; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
; GFX10-NEXT: v_and_b32_e32 v2, v2, v12
; GFX10-NEXT: v_and_b32_e32 v3, v3, v12
; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
; GFX10-NEXT: v_and_b32_e32 v2, v2, v10
; GFX10-NEXT: v_and_b32_e32 v3, v3, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
@ -2224,18 +2224,18 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
; GFX10-NEXT: v_and_b32_e32 v4, v11, v12
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX10-NEXT: v_and_b32_e32 v4, v4, v10
; GFX10-NEXT: v_and_b32_e32 v6, v6, v10
; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5
; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
; GFX10-NEXT: v_and_b32_e32 v11, v6, v12
; GFX10-NEXT: v_and_b32_e32 v4, v7, v12
; GFX10-NEXT: v_and_b32_e32 v5, v5, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; GFX10-NEXT: v_and_b32_e32 v4, v7, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3
; GFX10-NEXT: v_lshl_or_b32 v0, v0, v11, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
@ -4424,9 +4424,9 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
; GFX10-NEXT: v_and_b32_e32 v7, 63, v5
; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -4833,18 +4833,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_and_b32_e32 v19, 63, v8
; GFX10-NEXT: v_and_b32_e32 v15, 63, v9
; GFX10-NEXT: v_and_b32_e32 v9, 63, v11
; GFX10-NEXT: v_and_b32_e32 v13, 63, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
; GFX10-NEXT: v_or_b32_e32 v1, v12, v5
; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
; GFX10-NEXT: v_or_b32_e32 v3, v16, v7
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
ret <2 x i64> %result
@ -5317,46 +5317,44 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s6
; GFX10-NEXT: v_and_b32_e32 v19, s5, v15
; GFX10-NEXT: v_and_b32_e32 v20, s5, v8
; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v20, s5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v19
; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20
; GFX10-NEXT: v_mov_b32_e32 v25, v4
; GFX10-NEXT: v_mov_b32_e32 v26, v5
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v19
; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v11, v[9:10]
; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20
; GFX10-NEXT: v_lshlrev_b64 v[13:14], v19, v[9:10]
; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v20
; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[9:10]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[9:10]
; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[25:26]
; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[17:18], v17, v[6:7]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
; GFX10-NEXT: v_or_b32_e32 v10, v3, v12
; GFX10-NEXT: v_or_b32_e32 v11, v2, v11
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v21, v[6:7]
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v20
; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v13, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v11, v2, v11
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v20
; GFX10-NEXT: v_or_b32_e32 v10, v3, v12
; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v13, v15, v17
; GFX10-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v10, v16, v18
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[6:7]
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
; GFX10-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20
; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v1, s6
; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s6
; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v25, s5
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v26, s5
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s6
; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s6
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v5, s5
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v7, s4
; GFX10-NEXT: v_or_b32_e32 v0, v23, v0
; GFX10-NEXT: v_or_b32_e32 v0, v12, v0
; GFX10-NEXT: v_or_b32_e32 v1, v10, v1
; GFX10-NEXT: v_or_b32_e32 v2, v8, v2
; GFX10-NEXT: v_or_b32_e32 v3, v9, v3
@ -5591,31 +5589,31 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v12
; GFX10-NEXT: v_or_b32_e32 v2, v0, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v12
; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[10:11]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[10:11]
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: v_or_b32_e32 v6, v6, v8
; GFX10-NEXT: v_or_b32_e32 v7, v7, v9
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v13, s[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v12, s[6:7]
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s8, s2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1
; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s8, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX10-NEXT: v_or_b32_e32 v0, v11, v0
; GFX10-NEXT: v_or_b32_e32 v1, v15, v1
; GFX10-NEXT: v_or_b32_e32 v2, v19, v2
; GFX10-NEXT: v_or_b32_e32 v3, v6, v3
; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@ -5870,7 +5868,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
; GFX10-NEXT: s_sub_i32 s0, s8, 64
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
@ -5879,12 +5877,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX10-NEXT: v_or_b32_e32 v0, s4, v0
@ -6128,10 +6126,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: s_sub_i32 s5, 1, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_lshlrev_b64 v[13:14], s5, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v6, v4, v6
@ -6139,8 +6137,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
; GFX10-NEXT: s_and_b32 s5, 1, s7
; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v13, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v5, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc_lo
; GFX10-NEXT: s_sub_i32 s5, s6, 64
@ -6148,7 +6146,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
; GFX10-NEXT: s_sub_i32 s4, 64, s6
; GFX10-NEXT: s_cmp_lt_u32 s6, 64
; GFX10-NEXT: v_lshrrev_b64 v[11:12], s4, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[0:1]
; GFX10-NEXT: s_cmp_eq_u32 s6, 0
@ -6156,8 +6154,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[4:5], s5, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v2, v11, v6
; GFX10-NEXT: v_or_b32_e32 v3, v12, v7
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
; GFX10-NEXT: s_sub_i32 s10, s8, 64
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo
@ -6538,22 +6536,22 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: s_cmp_eq_u32 63, 0
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s4, v[0:1]
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[14:15], s5, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 31, v2
; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v11, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
; GFX10-NEXT: s_movk_i32 s6, 0x41
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: s_sub_i32 s5, 64, s6
; GFX10-NEXT: v_or_b32_e32 v12, v9, v10
; GFX10-NEXT: v_cndmask_b32_e32 v19, v14, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v8, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[10:11], s5, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5]
; GFX10-NEXT: s_sub_i32 s5, s6, 64
; GFX10-NEXT: s_cmp_lt_u32 s6, 64
; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[15:16], s5, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v12, v1, v12, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[6:7]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s6, 0
; GFX10-NEXT: v_or_b32_e32 v8, v8, v10
@ -6563,17 +6561,17 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
; GFX10-NEXT: s_and_b32 s5, 1, s5
; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s4
; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v5, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5
; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v7, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v1, v23, v1
; GFX10-NEXT: v_or_b32_e32 v1, v13, v1
; GFX10-NEXT: v_or_b32_e32 v2, v2, v4
; GFX10-NEXT: v_or_b32_e32 v3, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -6921,10 +6919,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17]
; GFX10-NEXT: s_sub_i32 s31, 64, 1
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: s_mov_b32 s62, s10
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: s_mov_b32 s63, s11
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s31
; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], 1
@ -6935,23 +6931,23 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: s_cselect_b64 s[46:47], s[2:3], s[0:1]
; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_sub_i32 s23, s16, 64
; GFX10-NEXT: s_sub_i32 s2, 64, s16
; GFX10-NEXT: s_cmp_lt_u32 s16, 64
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s16, 0
; GFX10-NEXT: s_cselect_b32 s29, 1, 0
; GFX10-NEXT: s_lshl_b64 s[24:25], s[46:47], s16
; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s16
; GFX10-NEXT: s_lshr_b64 s[2:3], s[26:27], s2
; GFX10-NEXT: s_lshl_b64 s[16:17], s[26:27], s16
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25]
; GFX10-NEXT: s_lshl_b64 s[24:25], s[26:27], s23
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
; GFX10-NEXT: s_cselect_b64 s[78:79], s[16:17], 0
; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[24:25]
; GFX10-NEXT: s_cmp_lg_u32 s29, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[46:47], s[2:3]
; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3]
; GFX10-NEXT: s_sub_i32 s26, s22, 64
; GFX10-NEXT: s_sub_i32 s23, 64, s22
; GFX10-NEXT: s_cmp_lt_u32 s22, 64
@ -6959,17 +6955,17 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX10-NEXT: s_cmp_eq_u32 s22, 0
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22
; GFX10-NEXT: s_lshl_b64 s[24:25], s[62:63], s23
; GFX10-NEXT: s_lshr_b64 s[22:23], s[62:63], s22
; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s23
; GFX10-NEXT: s_lshr_b64 s[22:23], s[10:11], s22
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25]
; GFX10-NEXT: s_lshr_b64 s[10:11], s[62:63], s26
; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0
; GFX10-NEXT: s_or_b64 s[0:1], s[78:79], s[0:1]
; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19]
; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21]
@ -7413,7 +7409,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: s_sub_i32 s5, 64, 1
; GFX10-NEXT: s_sub_i32 s6, 1, 64
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[17:18], s5, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3]
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
@ -7421,117 +7417,115 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16
; GFX10-NEXT: v_or_b32_e32 v21, v27, v21
; GFX10-NEXT: v_or_b32_e32 v18, v28, v22
; GFX10-NEXT: v_or_b32_e32 v21, v17, v21
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
; GFX10-NEXT: v_or_b32_e32 v18, v18, v22
; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16
; GFX10-NEXT: s_movk_i32 s7, 0x7f
; GFX10-NEXT: s_and_b32 s8, 1, s8
; GFX10-NEXT: v_and_b32_e32 v31, s7, v19
; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v21, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8
; GFX10-NEXT: v_and_b32_e32 v25, s7, v19
; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v24, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v31
; GFX10-NEXT: v_and_b32_e32 v26, s7, v16
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v31
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v19, v[17:18]
; GFX10-NEXT: v_mov_b32_e32 v35, v10
; GFX10-NEXT: v_mov_b32_e32 v36, v11
; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v26
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v31, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[23:24], v31, v[17:18]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v31
; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v26
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v25
; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v26
; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[17:18]
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[17:18]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26
; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[4:5]
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_or_b32_e32 v21, v2, v21
; GFX10-NEXT: v_or_b32_e32 v22, v3, v22
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v16, v[17:18]
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v25, v[35:36]
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9]
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v19, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v39, 0, v24, vcc_lo
; GFX10-NEXT: s_cmp_lt_u32 1, 64
; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v21, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v22, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v29, v[35:36]
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v27, v[10:11]
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25
; GFX10-NEXT: v_or_b32_e32 v16, v16, v18
; GFX10-NEXT: v_or_b32_e32 v17, v17, v19
; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7]
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20
; GFX10-NEXT: v_cndmask_b32_e32 v18, v21, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v31, v22, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v16, s4
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v19, v3, v17, s4
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], 1, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5]
; GFX10-NEXT: s_and_b32 s6, 1, s5
; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v8, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20
; GFX10-NEXT: v_or_b32_e32 v2, v27, v10
; GFX10-NEXT: v_or_b32_e32 v3, v28, v11
; GFX10-NEXT: v_lshrrev_b64 v[2:3], s5, v[4:5]
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 1, 0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_and_b32 s6, 1, s5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v10
; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v11
; GFX10-NEXT: s_and_b32 s8, 1, s8
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[35:36]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v16, s5
; GFX10-NEXT: v_cndmask_b32_e32 v16, v19, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v11, v4, v2, s6
; GFX10-NEXT: v_and_b32_e32 v30, s7, v25
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s6
; GFX10-NEXT: v_and_b32_e32 v25, s7, v25
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v3, s6
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v17, s5
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, v0, s4
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v30
; GFX10-NEXT: v_or_b32_e32 v0, v23, v21
; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v23, s7, v20
; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9]
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v30
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v30, v[3:4]
; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v25
; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23
; GFX10-NEXT: v_or_b32_e32 v1, v39, v16
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[3:4]
; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9]
; GFX10-NEXT: v_or_b32_e32 v1, v24, v16
; GFX10-NEXT: v_or_b32_e32 v2, v18, v19
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v30, v[8:9]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[8:9]
; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
; GFX10-NEXT: v_or_b32_e32 v10, v5, v10
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 64, v23
; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v30
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, v[8:9]
; GFX10-NEXT: v_or_b32_e32 v9, v6, v11
; GFX10-NEXT: v_lshrrev_b64 v[34:35], v5, v[14:15]
; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v23
; GFX10-NEXT: v_lshrrev_b64 v[5:6], v5, v[14:15]
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v16, v18, v20
; GFX10-NEXT: v_or_b32_e32 v18, v19, v21
; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[7:8], v23, v[14:15]
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v30
; GFX10-NEXT: v_cndmask_b32_e64 v5, v34, v16, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v16, s4
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23
; GFX10-NEXT: v_cndmask_b32_e64 v6, v35, v18, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v18, s4
; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v3, s6
; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v3, s6
; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v4, s6
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v12, s5
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v13, s5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v7, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s4
; GFX10-NEXT: v_or_b32_e32 v3, v31, v26
; GFX10-NEXT: v_or_b32_e32 v3, v22, v26
; GFX10-NEXT: v_or_b32_e32 v4, v11, v4
; GFX10-NEXT: v_or_b32_e32 v5, v14, v5
; GFX10-NEXT: v_or_b32_e32 v6, v15, v6
; GFX10-NEXT: v_or_b32_e32 v6, v10, v6
; GFX10-NEXT: v_or_b32_e32 v7, v9, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)

View File

@ -2235,8 +2235,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v11, v6, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1
; GFX10-NEXT: v_and_or_b32 v7, v2, v7, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0
@ -2482,8 +2482,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, v3, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v7, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
@ -2902,21 +2902,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: s_and_b32 s9, s2, s8
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7
; GFX10-NEXT: s_and_b32 s3, s3, 1
; GFX10-NEXT: v_mov_b32_e32 v13, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_lshl_b32 s3, s3, 4
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: s_lshl_b32 s8, s8, s3
; GFX10-NEXT: s_lshl_b32 s3, s9, s3
; GFX10-NEXT: s_not_b32 s8, s8
; GFX10-NEXT: v_mov_b32_e32 v13, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v11, v2, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s6
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0
@ -3822,19 +3822,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v5, s0
; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3
; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5
; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2
; GFX10-NEXT: v_mov_b32_e32 v11, 0
@ -4020,16 +4020,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: s_lshl_b32 s7, s8, s7
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0
; GFX10-NEXT: s_not_b32 s7, s7
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s4
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5
; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1
@ -4201,6 +4201,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0
@ -4220,9 +4221,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s3
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5
; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2

View File

@ -1638,11 +1638,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: s_movk_i32 s0, 0xff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: v_and_b32_sdwa v4, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v6, v0, s0, v1
; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_lshl_b32 s1, s0, s1
; GFX10-NEXT: s_not_b32 s1, s1
; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
@ -1794,9 +1794,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_and_or_b32 v3, v0, s1, v3
; GFX10-NEXT: v_or3_b32 v0, v3, v6, v4
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
@ -1804,10 +1804,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_and_or_b32 v3, v0, v3, v1
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10-NEXT: v_or3_b32 v2, v0, v2, v4
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or3_b32 v2, v3, v2, v4
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
%vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
@ -2324,13 +2324,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3
; GFX10-NEXT: s_lshr_b32 s0, s3, 2
; GFX10-NEXT: s_and_b32 s3, s3, 3
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5
; GFX10-NEXT: s_lshl_b32 s3, s3, 3
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0
; GFX10-NEXT: s_lshl_b32 s4, s1, s3
@ -2629,12 +2629,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3
; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_endpgm
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@ -2905,20 +2905,20 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3
; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_endpgm
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@ -3186,20 +3186,20 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4
; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3
; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4
; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_endpgm
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr
@ -3397,7 +3397,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
;
; GFX10-LABEL: insertelement_v_v8i8_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx2 v[11:12], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: v_and_b32_e32 v3, 3, v2
; GFX10-NEXT: s_movk_i32 s1, 0xff
@ -3405,22 +3405,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v12
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v12
; GFX10-NEXT: v_and_b32_sdwa v8, v11, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_and_b32_sdwa v8, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v9, v12, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v9, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v11, v11, s1, v4
; GFX10-NEXT: v_and_or_b32 v10, v12, s1, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v5
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s1
; GFX10-NEXT: s_and_b32 s0, s2, s1
; GFX10-NEXT: v_or3_b32 v0, v11, v8, v6
; GFX10-NEXT: v_or3_b32 v1, v10, v9, v7
; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v9, v7
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
@ -3906,34 +3906,34 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8
; GFX10-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_and_or_b32 v15, v0, s1, v6
; GFX10-NEXT: v_and_or_b32 v14, v1, s1, v7
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6
; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v6, v4, v5
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3
; GFX10-NEXT: v_or3_b32 v0, v15, v10, v8
; GFX10-NEXT: v_or3_b32 v1, v14, v11, v9
; GFX10-NEXT: v_or3_b32 v0, v0, v10, v8
; GFX10-NEXT: v_or3_b32 v1, v1, v11, v9
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v6
; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo
; GFX10-NEXT: v_and_or_b32 v2, v7, v4, v2
; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo
; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0
; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v2, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_b32_sdwa v4, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v3, v0, v5, v3
; GFX10-NEXT: v_and_or_b32 v1, v11, v5, v2
; GFX10-NEXT: v_or3_b32 v0, v3, v8, v6
; GFX10-NEXT: v_and_or_b32 v0, v0, v5, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7
; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: s_endpgm
%vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr
@ -4820,60 +4820,60 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10
; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v7, v1, v14, v8
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8
; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v12
; GFX10-NEXT: v_or3_b32 v2, v2, v15, v5
; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v10
; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc_lo
; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v7
; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2
; GFX10-NEXT: v_or3_b32 v3, v3, v16, v6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s1
; GFX10-NEXT: v_and_or_b32 v5, v5, s3, s2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v5, v2, s4, v9
; GFX10-NEXT: v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5
; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_and_or_b32 v18, v3, s4, v4
; GFX10-NEXT: v_or3_b32 v2, v5, v15, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8
; GFX10-NEXT: v_or3_b32 v3, v18, v16, v11
; GFX10-NEXT: v_or3_b32 v3, v3, v16, v11
; GFX10-NEXT: v_or3_b32 v2, v2, v15, v10
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -5323,12 +5323,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2
; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
@ -5337,7 +5336,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@ -5347,18 +5346,19 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4
; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4
; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5
; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7
; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11
; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -5814,16 +5814,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
; GFX10-NEXT: s_mov_b32 s2, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2
@ -5831,23 +5831,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3
; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4
; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4
; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6
; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8
; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5
; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7
; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11
; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -6300,16 +6300,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1
; GFX10-NEXT: s_mov_b32 s2, 8
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2
@ -6317,23 +6317,23 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3
; GFX10-NEXT: v_and_or_b32 v19, v15, s8, v4
; GFX10-NEXT: v_and_b32_sdwa v0, v15, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v4, v0, s8, v4
; GFX10-NEXT: v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX10-NEXT: v_and_or_b32 v6, v1, s8, v6
; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8
; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8
; GFX10-NEXT: v_and_b32_sdwa v15, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v2, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v3, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5
; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7
; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11
; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10
; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -6659,7 +6659,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
; GFX10-LABEL: insertelement_v_v16i8_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v22, 8
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s3, 0xff
; GFX10-NEXT: v_and_b32_e32 v0, 3, v2
@ -6669,76 +6669,76 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v26, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_or_b32 v19, v4, s3, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v15, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v16, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5
; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v26, v3, s3, v26
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6
; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v17, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6
; GFX10-NEXT: v_or3_b32 v3, v3, v15, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v12
; GFX10-NEXT: v_and_or_b32 v30, v5, s3, v11
; GFX10-NEXT: v_or3_b32 v3, v26, v15, v8
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v26, v19, v16, v10
; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v11
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10
; GFX10-NEXT: v_and_b32_sdwa v18, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_or3_b32 v5, v30, v17, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v14
; GFX10-NEXT: v_and_or_b32 v11, v6, s3, v9
; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v26, vcc_lo
; GFX10-NEXT: v_or3_b32 v5, v5, v17, v7
; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v9
; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2
; GFX10-NEXT: v_lshlrev_b32_e64 v9, v0, s3
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1
; GFX10-NEXT: v_or3_b32 v6, v11, v18, v8
; GFX10-NEXT: v_or3_b32 v6, v6, v18, v8
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v9
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1
; GFX10-NEXT: v_and_or_b32 v0, v7, v10, v0
; GFX10-NEXT: v_cndmask_b32_e32 v18, v26, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
; GFX10-NEXT: v_and_or_b32 v0, v7, v8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s2
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s1
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v18
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v19, v2, s3, v5
; GFX10-NEXT: v_and_b32_sdwa v14, v18, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v5
; GFX10-NEXT: v_and_b32_sdwa v16, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_and_or_b32 v3, v18, s3, v7
; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1
; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9
; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_or3_b32 v0, v2, v13, v6
; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8
; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11
; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -7063,7 +7063,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
; GFX10-LABEL: insertelement_v_v16i8_v_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v18, 8
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s3, 0xff
; GFX10-NEXT: s_lshr_b32 s4, s2, 2
@ -7079,69 +7079,69 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v27, 8, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v22, v4, s3, v19
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v3, v4, s3, v8
; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v23, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v16, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v14, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v11
; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v3, v22, v15, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v3, v3, v15, v9
; GFX10-NEXT: v_and_b32_sdwa v17, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v13
; GFX10-NEXT: v_or3_b32 v4, v5, v23, v4
; GFX10-NEXT: v_or3_b32 v4, v5, v16, v4
; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v8
; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2
; GFX10-NEXT: v_or3_b32 v7, v6, v17, v7
; GFX10-NEXT: v_or3_b32 v6, v6, v17, v7
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s1
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
; GFX10-NEXT: v_and_or_b32 v2, v5, s2, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v19, v1, v2, s2
; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v2, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v19
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v22
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v22
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v19
; GFX10-NEXT: v_and_b32_sdwa v13, v19, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v13, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v19, v19, s3, v5
; GFX10-NEXT: v_and_b32_sdwa v14, v22, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5
; GFX10-NEXT: v_and_b32_sdwa v16, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v3, v22, s3, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9
; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8
; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7
; GFX10-NEXT: v_or3_b32 v0, v1, v13, v6
; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10
; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8
; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
@ -7489,66 +7489,66 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i
; GFX10-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6
; GFX10-NEXT: v_and_or_b32 v4, v4, s1, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7
; GFX10-NEXT: v_and_or_b32 v5, v5, s1, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 24, v7
; GFX10-NEXT: v_or3_b32 v4, v4, v17, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14
; GFX10-NEXT: v_and_or_b32 v6, v6, s1, v13
; GFX10-NEXT: v_or3_b32 v15, v4, v17, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or3_b32 v5, v5, v18, v12
; GFX10-NEXT: v_and_b32_sdwa v20, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v16
; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v14
; GFX10-NEXT: v_or3_b32 v6, v6, v19, v9
; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v5, vcc_lo
; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v11
; GFX10-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v11, v0, v1
; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3
; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v6, s0
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1
; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2
; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v27, v7, v0, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v0, s2
; GFX10-NEXT: v_cndmask_b32_e32 v18, v5, v0, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v19, 8, v27
; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v18
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v4
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v27
; GFX10-NEXT: v_lshlrev_b32_sdwa v23, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v21, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v19, v2, v1, v23
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_sdwa v14, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v15, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_sdwa v16, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v10, v4, v1, v10
; GFX10-NEXT: v_and_b32_sdwa v17, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v5
; GFX10-NEXT: v_and_b32_sdwa v17, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_or_b32 v5, v0, v1, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v13
; GFX10-NEXT: v_and_or_b32 v3, v27, v1, v8
; GFX10-NEXT: v_and_or_b32 v2, v18, v1, v7
; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v7
; GFX10-NEXT: v_and_or_b32 v4, v4, v1, v10
; GFX10-NEXT: v_or3_b32 v0, v2, v14, v6
; GFX10-NEXT: v_or3_b32 v1, v3, v15, v9
; GFX10-NEXT: v_or3_b32 v2, v4, v16, v11
; GFX10-NEXT: v_or3_b32 v3, v5, v17, v12
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_or3_b32 v0, v19, v21, v6
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_or3_b32 v3, v3, v17, v12
; GFX10-NEXT: v_or3_b32 v1, v2, v15, v9
; GFX10-NEXT: v_or3_b32 v2, v10, v16, v11
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr

View File

@ -989,8 +989,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: v_mov_b32_e32 v16, s15
; MOVREL-NEXT: v_mov_b32_e32 v2, s1
; MOVREL-NEXT: v_mov_b32_e32 v1, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; MOVREL-NEXT: v_mov_b32_e32 v1, s0
; MOVREL-NEXT: v_mov_b32_e32 v15, s14
; MOVREL-NEXT: v_mov_b32_e32 v14, s13
; MOVREL-NEXT: v_mov_b32_e32 v13, s12
@ -1005,30 +1005,28 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v4, s3
; MOVREL-NEXT: v_mov_b32_e32 v3, s2
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
; MOVREL-NEXT: s_mov_b32 s30, s18
; MOVREL-NEXT: s_mov_b32 s31, s19
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
@ -1525,19 +1523,17 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
;
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2
@ -2161,8 +2157,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
@ -2171,9 +2165,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2
@ -3550,28 +3544,28 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v16, s7
; MOVREL-NEXT: v_mov_b32_e32 v9, s0
; MOVREL-NEXT: v_mov_b32_e32 v14, s7
; MOVREL-NEXT: v_mov_b32_e32 v7, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
; MOVREL-NEXT: v_mov_b32_e32 v10, s1
; MOVREL-NEXT: v_mov_b32_e32 v11, s2
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v13, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v8, s1
; MOVREL-NEXT: v_mov_b32_e32 v9, s2
; MOVREL-NEXT: v_mov_b32_e32 v10, s3
; MOVREL-NEXT: v_mov_b32_e32 v11, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
; MOVREL-NEXT: v_mov_b32_e32 v14, s5
; MOVREL-NEXT: v_mov_b32_e32 v15, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v10, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v12, s5
; MOVREL-NEXT: v_mov_b32_e32 v13, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 4
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 5
; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 6
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v0, v7
; MOVREL-NEXT: ; return to shader part epilog
entry:
@ -3624,29 +3618,29 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v16, s7
; MOVREL-NEXT: v_mov_b32_e32 v9, s0
; MOVREL-NEXT: v_mov_b32_e32 v15, s7
; MOVREL-NEXT: v_mov_b32_e32 v8, s0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; MOVREL-NEXT: v_mov_b32_e32 v10, s1
; MOVREL-NEXT: v_mov_b32_e32 v11, s2
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v13, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v8, v9, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v9, s1
; MOVREL-NEXT: v_mov_b32_e32 v10, s2
; MOVREL-NEXT: v_mov_b32_e32 v11, s3
; MOVREL-NEXT: v_mov_b32_e32 v12, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; MOVREL-NEXT: v_mov_b32_e32 v14, s5
; MOVREL-NEXT: v_mov_b32_e32 v15, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v13, s5
; MOVREL-NEXT: v_mov_b32_e32 v14, s6
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
; MOVREL-NEXT: v_mov_b32_e32 v1, v7
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo
; MOVREL-NEXT: v_mov_b32_e32 v0, v8
; MOVREL-NEXT: ; return to shader part epilog
entry:
@ -4128,23 +4122,21 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
; MOVREL-NEXT: v_mov_b32_e32 v19, v2
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; MOVREL-NEXT: v_mov_b32_e32 v18, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5
@ -4271,38 +4263,38 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
; MOVREL-NEXT: v_mov_b32_e32 v20, s15
; MOVREL-NEXT: v_mov_b32_e32 v19, s14
; MOVREL-NEXT: v_mov_b32_e32 v18, s13
; MOVREL-NEXT: v_mov_b32_e32 v17, s12
; MOVREL-NEXT: v_mov_b32_e32 v16, s11
; MOVREL-NEXT: v_mov_b32_e32 v15, s10
; MOVREL-NEXT: v_mov_b32_e32 v14, s9
; MOVREL-NEXT: v_mov_b32_e32 v13, s8
; MOVREL-NEXT: v_mov_b32_e32 v12, s7
; MOVREL-NEXT: v_mov_b32_e32 v11, s6
; MOVREL-NEXT: v_mov_b32_e32 v10, s5
; MOVREL-NEXT: v_mov_b32_e32 v9, s4
; MOVREL-NEXT: v_mov_b32_e32 v8, s3
; MOVREL-NEXT: v_mov_b32_e32 v7, s2
; MOVREL-NEXT: v_mov_b32_e32 v6, s1
; MOVREL-NEXT: v_mov_b32_e32 v5, s0
; MOVREL-NEXT: v_mov_b32_e32 v17, s15
; MOVREL-NEXT: v_mov_b32_e32 v16, s14
; MOVREL-NEXT: v_mov_b32_e32 v15, s13
; MOVREL-NEXT: v_mov_b32_e32 v14, s12
; MOVREL-NEXT: v_mov_b32_e32 v13, s11
; MOVREL-NEXT: v_mov_b32_e32 v12, s10
; MOVREL-NEXT: v_mov_b32_e32 v11, s9
; MOVREL-NEXT: v_mov_b32_e32 v10, s8
; MOVREL-NEXT: v_mov_b32_e32 v9, s7
; MOVREL-NEXT: v_mov_b32_e32 v8, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s5
; MOVREL-NEXT: v_mov_b32_e32 v6, s4
; MOVREL-NEXT: v_mov_b32_e32 v5, s3
; MOVREL-NEXT: v_mov_b32_e32 v4, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo
; MOVREL-NEXT: v_readfirstlane_b32 s2, v4
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v0, v10, v0, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v11, v1, s1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
@ -4466,15 +4458,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@ -4531,15 +4521,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3

View File

@ -1828,10 +1828,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v8, s3
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc

View File

@ -29,7 +29,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10NSA-LABEL: gather4_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@ -45,7 +45,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_cube:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -140,7 +140,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_2darray:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -195,7 +195,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_c_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@ -211,7 +211,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -249,7 +249,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10NSA-LABEL: gather4_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -306,7 +306,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_c_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -361,7 +361,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@ -377,7 +377,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -413,7 +413,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_c_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@ -429,7 +429,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -467,7 +467,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog
@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10NSA-LABEL: gather4_c_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff
@ -543,7 +543,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog

View File

@ -80,7 +80,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_2d_tfe:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0
@ -101,7 +101,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0
; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0
; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
; GFX10NSA-NEXT: ; return to shader part epilog

View File

@ -65,16 +65,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: v_mov_b32_e32 v12, v11
; GFX10-NEXT: v_mov_b32_e32 v13, v11
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: v_mov_b32_e32 v15, v11
; GFX10-NEXT: v_mov_b32_e32 v0, v11
; GFX10-NEXT: v_mov_b32_e32 v9, v8
; GFX10-NEXT: v_mov_b32_e32 v10, v8
; GFX10-NEXT: v_mov_b32_e32 v11, v8
; GFX10-NEXT: v_mov_b32_e32 v12, v8
; GFX10-NEXT: v_mov_b32_e32 v0, v8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
@ -82,13 +82,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: v_mov_b32_e32 v1, v12
; GFX10-NEXT: v_mov_b32_e32 v2, v13
; GFX10-NEXT: v_mov_b32_e32 v3, v14
; GFX10-NEXT: v_mov_b32_e32 v4, v15
; GFX10-NEXT: v_mov_b32_e32 v1, v9
; GFX10-NEXT: v_mov_b32_e32 v2, v10
; GFX10-NEXT: v_mov_b32_e32 v3, v11
; GFX10-NEXT: v_mov_b32_e32 v4, v12
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v11, v4, s[10:11]
; GFX10-NEXT: global_store_dword v8, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
@ -129,16 +129,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: v_mov_b32_e32 v12, v11
; GFX10-NEXT: v_mov_b32_e32 v13, v11
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: v_mov_b32_e32 v15, v11
; GFX10-NEXT: v_mov_b32_e32 v0, v11
; GFX10-NEXT: v_mov_b32_e32 v9, v8
; GFX10-NEXT: v_mov_b32_e32 v10, v8
; GFX10-NEXT: v_mov_b32_e32 v11, v8
; GFX10-NEXT: v_mov_b32_e32 v12, v8
; GFX10-NEXT: v_mov_b32_e32 v0, v8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
@ -146,13 +146,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: v_mov_b32_e32 v1, v12
; GFX10-NEXT: v_mov_b32_e32 v2, v13
; GFX10-NEXT: v_mov_b32_e32 v3, v14
; GFX10-NEXT: v_mov_b32_e32 v4, v15
; GFX10-NEXT: v_mov_b32_e32 v1, v9
; GFX10-NEXT: v_mov_b32_e32 v2, v10
; GFX10-NEXT: v_mov_b32_e32 v3, v11
; GFX10-NEXT: v_mov_b32_e32 v4, v12
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v11, v4, s[10:11]
; GFX10-NEXT: global_store_dword v8, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)

View File

@ -22,9 +22,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -35,14 +35,14 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) {
; GFX10-LABEL: sample_d_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v11, s12
; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4
; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12
; GFX10-NEXT: v_and_or_b32 v0, v0, v9, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v9, s12
; GFX10-NEXT: v_and_or_b32 v2, v3, v9, v4
; GFX10-NEXT: v_and_or_b32 v3, v5, v9, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -72,9 +72,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -85,10 +85,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -102,10 +102,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -116,10 +116,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -133,9 +133,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -165,9 +165,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -196,9 +196,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -209,10 +209,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -226,10 +226,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -240,10 +240,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -257,9 +257,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -273,9 +273,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -289,9 +289,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog

View File

@ -79,9 +79,9 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5
; GFX10-NEXT: v_dot4_i32_i8 v0, v7, v1, v8
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32

View File

@ -79,9 +79,9 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5
; GFX10-NEXT: v_dot4_u32_u8 v0, v7, v1, v8
; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%a.cast = bitcast <4 x i8> %a to i32
%b.cast = bitcast <4 x i8> %b to i32

View File

@ -351,8 +351,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB3_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
; GFX10-32-NEXT: s_wqm_b32 s28, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT: s_wqm_b32 s14, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB3_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@ -374,7 +374,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB3_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB3_3: ; %.continue
; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
@ -487,8 +487,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB4_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
; GFX10-32-NEXT: s_wqm_b32 s28, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT: s_wqm_b32 s14, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB4_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0
@ -510,7 +510,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB4_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@ -519,7 +519,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB4_3: ; %.continue
; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@ -632,8 +632,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT: s_cbranch_scc0 BB5_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D

View File

@ -192,7 +192,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
; GFX10-NEXT: ds_read_u8 v25, v0 offset:12
; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
@ -213,7 +213,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
@ -221,7 +221,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21
; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
@ -230,7 +230,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10
; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5

View File

@ -158,11 +158,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
; GFX10-NEXT: ds_read_u8 v15, v0 offset:10
; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
; GFX10-NEXT: ds_read_u8 v10, v0
; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
; GFX10-NEXT: ds_read_u8 v14, v0 offset:8
; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
; GFX10-NEXT: v_mov_b32_e32 v13, 8
; GFX10-NEXT: s_movk_i32 s4, 0xff
@ -182,19 +182,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v8, v15, v12
; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(3)
; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
@ -266,9 +265,9 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u16 v1, v0 offset:2
; GFX10-NEXT: ds_read_u16 v2, v0 offset:6
; GFX10-NEXT: ds_read_u16 v3, v0 offset:10
; GFX10-NEXT: ds_read_u16 v7, v0
; GFX10-NEXT: ds_read_u16 v11, v0 offset:4
; GFX10-NEXT: ds_read_u16 v15, v0 offset:8
; GFX10-NEXT: ds_read_u16 v4, v0
; GFX10-NEXT: ds_read_u16 v5, v0 offset:4
; GFX10-NEXT: ds_read_u16 v6, v0 offset:8
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_and_b32_e32 v0, s4, v1
@ -280,11 +279,11 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_and_or_b32 v0, v7, s4, v0
; GFX10-NEXT: v_and_or_b32 v0, v4, s4, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-NEXT: v_and_or_b32 v1, v11, s4, v1
; GFX10-NEXT: v_and_or_b32 v1, v5, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v2, v15, s4, v2
; GFX10-NEXT: v_and_or_b32 v2, v6, s4, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
ret <3 x i32> %load

View File

@ -108,7 +108,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v10, v0 offset:8
; GFX10-NEXT: ds_read_u8 v12, v0 offset:10
; GFX10-NEXT: ds_read_u8 v13, v0 offset:11
; GFX10-NEXT: ds_read_u8 v25, v0 offset:12
; GFX10-NEXT: ds_read_u8 v14, v0 offset:12
; GFX10-NEXT: ds_read_u8 v15, v0 offset:13
; GFX10-NEXT: ds_read_u8 v16, v0 offset:14
; GFX10-NEXT: ds_read_u8 v0, v0 offset:15
@ -129,7 +129,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
; GFX10-NEXT: v_and_b32_e32 v6, v6, v11
; GFX10-NEXT: s_waitcnt lgkmcnt(9)
; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
@ -137,7 +137,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v9, v13, v11
; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21
; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
@ -146,7 +146,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: v_and_b32_e32 v0, v0, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10
; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@ -242,11 +242,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: ds_read_u8 v5, v0 offset:6
; GFX10-NEXT: ds_read_u8 v6, v0 offset:7
; GFX10-NEXT: ds_read_u8 v7, v0 offset:9
; GFX10-NEXT: ds_read_u8 v15, v0 offset:10
; GFX10-NEXT: ds_read_u8 v8, v0 offset:10
; GFX10-NEXT: ds_read_u8 v9, v0 offset:11
; GFX10-NEXT: ds_read_u8 v10, v0
; GFX10-NEXT: ds_read_u8 v11, v0 offset:4
; GFX10-NEXT: ds_read_u8 v14, v0 offset:8
; GFX10-NEXT: ds_read_u8 v0, v0 offset:8
; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
; GFX10-NEXT: v_mov_b32_e32 v13, 8
; GFX10-NEXT: s_movk_i32 s4, 0xff
@ -266,19 +266,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX10-NEXT: s_waitcnt lgkmcnt(5)
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
; GFX10-NEXT: v_and_b32_e32 v8, v15, v12
; GFX10-NEXT: v_and_b32_e32 v8, v8, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(3)
; GFX10-NEXT: v_and_b32_e32 v9, v9, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(1)
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7
; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3
@ -410,27 +409,27 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: ds_write_b8 v0, v7 offset:1
; GFX10-NEXT: ds_write_b8 v0, v4 offset:1
; GFX10-NEXT: ds_write_b8 v0, v5 offset:2
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX10-NEXT: ds_write_b8 v0, v6 offset:3
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
; GFX10-NEXT: ds_write_b8 v0, v1 offset:5
; GFX10-NEXT: ds_write_b8 v0, v7 offset:6
; GFX10-NEXT: ds_write_b8 v0, v4 offset:6
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v3
; GFX10-NEXT: ds_write_b8 v0, v5 offset:7
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
; GFX10-NEXT: ds_write_b8 v0, v1 offset:9
; GFX10-NEXT: ds_write_b8 v0, v2 offset:10
; GFX10-NEXT: ds_write_b8 v0, v7 offset:11
; GFX10-NEXT: ds_write_b8 v0, v4 offset:11
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1

View File

@ -1608,12 +1608,8 @@ define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[10:11]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[7:8]
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr <2 x i64> %value, %amount
ret <2 x i64> %result

View File

@ -585,12 +585,12 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v3, v11, v6
; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
@ -997,24 +997,24 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v18, s4, v13, v11
; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
; GFX10-NEXT: v_add_co_u32 v11, s4, v18, v15
; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
; GFX10-NEXT: v_add_nc_u32_e32 v10, v3, v10
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v10, v10, v13, v7
; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
; GFX10-NEXT: v_add3_u32 v1, v10, v15, v1
; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -2758,13 +2758,15 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
@ -2781,7 +2783,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
@ -2791,11 +2793,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
; GFX10-NEXT: v_add3_u32 v18, v19, v29, v18
; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add3_u32 v30, v21, v24, v23
; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
@ -2813,7 +2814,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v19, v30, v23, v20
; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
@ -2822,120 +2823,119 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v35, v23, v30, v21
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v20, v35, v25, v20
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v34, v27, v28
; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
; GFX10-NEXT: v_add_co_u32 v27, s4, v31, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v27, v29
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
; GFX10-NEXT: v_add_co_u32 v31, s5, v21, v26
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
; GFX10-NEXT: v_add_co_u32 v21, s5, v31, v29
; GFX10-NEXT: v_add3_u32 v39, v23, v33, v26
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29
; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v34, s4, v24, v23
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v35, v28, v35, v29
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v27
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_add_co_u32 v34, s4, v23, v24
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v35, v30, v32
; GFX10-NEXT: v_add3_u32 v21, v39, v26, v21
; GFX10-NEXT: v_add_co_u32 v34, s4, v34, v27
; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24
; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v31
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31
; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v22, v28, v27
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX10-NEXT: v_add_co_u32 v30, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v33, v22, v24, v25
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v30, v27
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
; GFX10-NEXT: v_add3_u32 v33, v33, v26, v27
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
; GFX10-NEXT: v_add3_u32 v26, v7, v29, v28
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
; GFX10-NEXT: v_add3_u32 v7, v26, v24, v15
; GFX10-NEXT: v_add_co_u32 v11, s4, v23, v25
; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28
; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15
; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v21
; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v5, v33, v10, v7
; GFX10-NEXT: v_add3_u32 v3, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_add3_u32 v5, v22, v10, v7
; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_add3_u32 v7, v3, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: v_add3_u32 v7, v1, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]

View File

@ -413,12 +413,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
; GFX10-NEXT: v_rndne_f16_sdwa v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v7
; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)

View File

@ -4199,16 +4199,16 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@ -4543,30 +4543,26 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v17, v2
; GFX10-NEXT: v_mov_b32_e32 v18, v3
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6
; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11
; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@ -5327,7 +5323,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
; GFX10-NEXT: s_movk_i32 s0, 0x7f
; GFX10-NEXT: s_sub_i32 s1, 64, s0
; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
@ -5335,33 +5330,34 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5]
; GFX10-NEXT: s_sub_i32 s1, s0, 64
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v8, v15, v8
; GFX10-NEXT: v_or_b32_e32 v9, v16, v9
; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; GFX10-NEXT: v_or_b32_e32 v8, v0, v8
; GFX10-NEXT: v_or_b32_e32 v9, v1, v9
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s1
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
@ -5569,64 +5565,60 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
;
; GFX10-LABEL: saddsat_i128_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_add_co_u32 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20
; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
; GFX10-NEXT: s_movk_i32 s0, 0x7f
; GFX10-NEXT: s_sub_i32 s2, 64, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5]
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20]
; GFX10-NEXT: s_sub_i32 s1, s0, 64
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20]
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7]
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_or_b32_e32 v2, v0, v2
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20]
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: s_and_b32 s0, 1, s1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@ -5959,28 +5951,20 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_mov_b32_e32 v20, v2
; GFX10-NEXT: v_mov_b32_e32 v21, v3
; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v0, v8
; GFX10-NEXT: s_movk_i32 s5, 0x7f
; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v22, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: s_sub_i32 s6, 64, s5
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
; GFX10-NEXT: s_sub_i32 s7, s5, 64
; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23]
; GFX10-NEXT: v_mov_b32_e32 v26, v4
; GFX10-NEXT: v_mov_b32_e32 v27, v5
; GFX10-NEXT: v_mov_b32_e32 v24, v6
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v25, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19]
; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[8:9]
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17]
@ -5991,7 +5975,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19
; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19]
@ -5999,33 +5982,34 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8
; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20
; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: v_add_co_u32 v8, s4, v26, v12
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4
; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v24, v14, s4
; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v12
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v13, s4
; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v6, v14, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v25, v15, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27]
; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v7, v15, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9]
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25]
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4
; GFX10-NEXT: v_cmp_gt_u64_e64 s4, 0, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11]
@ -6035,7 +6019,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_or_b32_e32 v13, v4, v13
; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25]
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15]
; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11]
@ -6049,13 +6033,13 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: s_and_b32 s6, 1, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7
; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5
; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7
@ -6592,23 +6576,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s16, s0, s8
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mov_b32 s46, s0
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_mov_b32 s47, s1
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_addc_u32 s17, s1, s9
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_addc_u32 s30, s2, s10
; GFX10-NEXT: s_addc_u32 s18, s2, s10
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
; GFX10-NEXT: s_addc_u32 s31, s3, s11
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3]
; GFX10-NEXT: s_addc_u32 s19, s3, s11
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
@ -6628,13 +6610,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20
; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22
; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_ashr_i32 s10, s31, 31
; GFX10-NEXT: s_ashr_i32 s10, s19, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21
; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: v_mov_b32_e32 v3, s19
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
@ -6669,7 +6651,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
; GFX10-NEXT: s_and_b32 s3, s3, 1
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: v_mov_b32_e32 v2, s18
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: s_addc_u32 s3, s5, s13
; GFX10-NEXT: s_cselect_b32 s8, 1, 0

View File

@ -1571,12 +1571,8 @@ define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v7, v2
; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[10:11]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[7:8]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = shl <2 x i64> %value, %amount
ret <2 x i64> %result

View File

@ -4185,16 +4185,16 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@ -4529,30 +4529,26 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v17, v2
; GFX10-NEXT: v_mov_b32_e32 v18, v3
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11
; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5
; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@ -5313,7 +5309,6 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
; GFX10-NEXT: s_movk_i32 s0, 0x7f
; GFX10-NEXT: s_sub_i32 s1, 64, s0
; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
@ -5321,33 +5316,34 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7]
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5]
; GFX10-NEXT: s_sub_i32 s1, s0, 64
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v8, v15, v8
; GFX10-NEXT: v_or_b32_e32 v9, v16, v9
; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; GFX10-NEXT: v_or_b32_e32 v8, v0, v8
; GFX10-NEXT: v_or_b32_e32 v9, v1, v9
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
; GFX10-NEXT: s_and_b32 s0, 1, s1
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
@ -5555,64 +5551,60 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
;
; GFX10-LABEL: ssubsat_i128_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0
; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo
; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6]
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20
; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
; GFX10-NEXT: s_movk_i32 s0, 0x7f
; GFX10-NEXT: s_sub_i32 s2, 64, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7]
; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5]
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20]
; GFX10-NEXT: s_sub_i32 s1, s0, 64
; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20]
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7]
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_or_b32_e32 v2, v0, v2
; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20]
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7]
; GFX10-NEXT: s_and_b32 s0, 1, s1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@ -5945,28 +5937,20 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_mov_b32_e32 v20, v2
; GFX10-NEXT: v_mov_b32_e32 v21, v3
; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8
; GFX10-NEXT: s_movk_i32 s5, 0x7f
; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v22, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: s_sub_i32 s6, 64, s5
; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
; GFX10-NEXT: s_sub_i32 s7, s5, 64
; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23]
; GFX10-NEXT: v_mov_b32_e32 v26, v4
; GFX10-NEXT: v_mov_b32_e32 v27, v5
; GFX10-NEXT: v_mov_b32_e32 v24, v6
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v25, v7
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19]
; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17]
@ -5977,7 +5961,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19
; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19]
@ -5985,33 +5968,34 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo
; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8
; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20
; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: v_sub_co_u32 v8, s4, v26, v12
; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4
; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4
; GFX10-NEXT: v_sub_co_u32 v8, s4, v4, v12
; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v5, v13, s4
; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v6, v14, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v25, v15, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27]
; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v7, v15, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9]
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25]
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13]
; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11]
@ -6021,7 +6005,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_or_b32_e32 v13, v4, v13
; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25]
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15]
; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11]
@ -6035,13 +6019,13 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: s_and_b32 s6, 1, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7
; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5
; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7
@ -6578,23 +6562,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s16, s0, s8
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mov_b32 s46, s0
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_mov_b32 s47, s1
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_subb_u32 s17, s1, s9
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_subb_u32 s30, s2, s10
; GFX10-NEXT: s_subb_u32 s18, s2, s10
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
; GFX10-NEXT: s_subb_u32 s31, s3, s11
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3]
; GFX10-NEXT: s_subb_u32 s19, s3, s11
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
@ -6614,13 +6596,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20
; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22
; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_ashr_i32 s10, s31, 31
; GFX10-NEXT: s_ashr_i32 s10, s19, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21
; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
@ -6641,7 +6623,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: v_mov_b32_e32 v3, s19
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
; GFX10-NEXT: s_and_b32 s3, s3, 1
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: v_mov_b32_e32 v2, s18
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: s_subb_u32 s3, s5, s13
; GFX10-NEXT: s_cselect_b32 s8, 1, 0

View File

@ -176,22 +176,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: s_lshr_b32 s9, s6, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v15, s1
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v10, s5
; GFX10-NEXT: s_lshr_b32 s0, s6, 24
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: v_mov_b32_e32 v11, s9
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: v_mov_b32_e32 v8, s4
; GFX10-NEXT: v_mov_b32_e32 v19, s8
; GFX10-NEXT: v_mov_b32_e32 v9, s8
; GFX10-NEXT: ds_write_b8 v1, v0
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
; GFX10-NEXT: ds_write_b8 v1, v4 offset:1
; GFX10-NEXT: ds_write_b8 v1, v15 offset:2
; GFX10-NEXT: ds_write_b8 v1, v5 offset:2
; GFX10-NEXT: ds_write_b8 v1, v6 offset:3
; GFX10-NEXT: ds_write_b8 v1, v7 offset:5
; GFX10-NEXT: ds_write_b8 v1, v8 offset:6
; GFX10-NEXT: ds_write_b8 v1, v19 offset:7
; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
; GFX10-NEXT: s_lshr_b32 s1, s7, 16
@ -202,12 +202,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: s_lshr_b32 s2, s7, 24
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: v_mov_b32_e32 v5, s2
; GFX10-NEXT: ds_write_b8 v1, v0 offset:11
; GFX10-NEXT: ds_write_b8 v1, v2 offset:12
; GFX10-NEXT: ds_write_b8 v1, v3 offset:13
; GFX10-NEXT: ds_write_b8 v1, v4 offset:14
; GFX10-NEXT: ds_write_b8 v1, v7 offset:15
; GFX10-NEXT: ds_write_b8 v1, v5 offset:15
; GFX10-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
@ -286,7 +286,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: s_lshr_b32 s2, s6, 16
; GFX10-NEXT: s_lshr_b32 s3, s7, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s7
; GFX10-NEXT: v_mov_b32_e32 v11, s0
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: v_mov_b32_e32 v6, s1
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: v_mov_b32_e32 v8, s3
@ -294,7 +294,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b16 v1, v2 offset:4
; GFX10-NEXT: ds_write_b16 v1, v3 offset:8
; GFX10-NEXT: ds_write_b16 v1, v4 offset:12
; GFX10-NEXT: ds_write_b16 v1, v11 offset:2
; GFX10-NEXT: ds_write_b16 v1, v5 offset:2
; GFX10-NEXT: ds_write_b16 v1, v6 offset:6
; GFX10-NEXT: ds_write_b16 v1, v7 offset:10
; GFX10-NEXT: ds_write_b16 v1, v8 offset:14

View File

@ -147,12 +147,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: s_lshr_b32 s3, s12, 24
; GFX10-NEXT: s_lshr_b32 s6, s14, 8
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v15, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s5
; GFX10-NEXT: s_lshr_b32 s2, s13, 8
; GFX10-NEXT: s_lshr_b32 s4, s13, 16
; GFX10-NEXT: s_lshr_b32 s7, s14, 16
; GFX10-NEXT: v_mov_b32_e32 v3, s14
; GFX10-NEXT: v_mov_b32_e32 v11, s1
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: s_lshr_b32 s8, s14, 24
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: v_mov_b32_e32 v10, s6
@ -161,13 +161,13 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b8 v1, v0
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
; GFX10-NEXT: ds_write_b8 v1, v4 offset:1
; GFX10-NEXT: ds_write_b8 v1, v11 offset:2
; GFX10-NEXT: ds_write_b8 v1, v5 offset:2
; GFX10-NEXT: ds_write_b8 v1, v6 offset:3
; GFX10-NEXT: ds_write_b8 v1, v7 offset:5
; GFX10-NEXT: ds_write_b8 v1, v8 offset:6
; GFX10-NEXT: v_mov_b32_e32 v0, s7
; GFX10-NEXT: v_mov_b32_e32 v2, s8
; GFX10-NEXT: ds_write_b8 v1, v15 offset:7
; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
@ -239,13 +239,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v3, s14
; GFX10-NEXT: s_lshr_b32 s2, s14, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v7, s1
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: ds_write_b16 v1, v0
; GFX10-NEXT: ds_write_b16 v1, v2 offset:4
; GFX10-NEXT: ds_write_b16 v1, v3 offset:8
; GFX10-NEXT: ds_write_b16 v1, v4 offset:2
; GFX10-NEXT: ds_write_b16 v1, v7 offset:6
; GFX10-NEXT: ds_write_b16 v1, v5 offset:6
; GFX10-NEXT: ds_write_b16 v1, v6 offset:10
; GFX10-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2

View File

@ -2819,20 +2819,16 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v11, v5
; GFX10-NEXT: v_mov_b32_e32 v15, v6
; GFX10-NEXT: v_mov_b32_e32 v16, v7
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v2, v15
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@ -3203,22 +3199,22 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
;
; GFX10-LABEL: uaddsat_i128_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[0:1]
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v10, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@ -3435,33 +3431,25 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v18, v8
; GFX10-NEXT: v_mov_b32_e32 v19, v9
; GFX10-NEXT: v_mov_b32_e32 v16, v10
; GFX10-NEXT: v_mov_b32_e32 v17, v11
; GFX10-NEXT: v_mov_b32_e32 v10, v12
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v18
; GFX10-NEXT: v_mov_b32_e32 v11, v13
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v19, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v20, v14
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v16, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v21, v15
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v17, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[18:19]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v11, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v20, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v21, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[16:17]
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[10:11]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[20:21]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[16:17]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[20:21]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8

View File

@ -2689,16 +2689,12 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, v4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v11, v5, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
; GFX10-NEXT: v_sub_co_u32 v4, s4, v0, v6
; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v1, v7, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[6:7]
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6
; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4
@ -2974,7 +2970,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_subb_u32 s14, s2, s6
; GFX10-NEXT: s_subb_u32 s10, s2, s6
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_and_b32 s11, s11, 1
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
@ -2989,7 +2985,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s14, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
@ -3305,41 +3301,33 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v22, v0
; GFX10-NEXT: v_mov_b32_e32 v23, v1
; GFX10-NEXT: v_mov_b32_e32 v20, v2
; GFX10-NEXT: v_mov_b32_e32 v21, v3
; GFX10-NEXT: v_mov_b32_e32 v26, v4
; GFX10-NEXT: v_mov_b32_e32 v27, v5
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[22:23], v[8:9]
; GFX10-NEXT: v_mov_b32_e32 v24, v6
; GFX10-NEXT: v_mov_b32_e32 v25, v7
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[20:21], v[10:11]
; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[24:25], v[14:15]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[10:11]
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[26:27], v[12:13]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[24:25], v[14:15]
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v22, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5
; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v20, v10, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v21, v11, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v26, v12
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v27, v13, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v24, v14, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v25, v15, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5
@ -3630,7 +3618,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_and_b32 s1, s1, 1
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
; GFX10-NEXT: s_subb_u32 s30, s6, s14
; GFX10-NEXT: s_subb_u32 s10, s6, s14
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s0, s0, 1
@ -3656,7 +3644,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: v_readfirstlane_b32 s2, v3
; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s30, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
; GFX10-NEXT: v_readfirstlane_b32 s5, v1

View File

@ -500,12 +500,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB2_2:
@ -551,11 +551,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB2_2:
@ -1680,12 +1680,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB9_2:
@ -1731,11 +1731,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB9_2:
@ -2534,12 +2534,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB14_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB14_2:
@ -2585,11 +2585,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB14_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB14_2:
@ -2768,12 +2768,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB15_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB15_2:
@ -2819,11 +2819,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB15_2:
@ -3002,12 +3002,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB16_2:
@ -3053,11 +3053,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB16_2:
@ -3238,12 +3238,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB17_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB17_2:
@ -3291,11 +3291,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB17_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB17_2:
@ -3655,12 +3655,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB19_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB19_2:
@ -3708,11 +3708,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB19_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB19_2:
@ -4070,12 +4070,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB21_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB21_2:
@ -4121,11 +4121,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB21_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB21_2:
@ -4480,12 +4480,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB23_2:
@ -4531,11 +4531,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB23_2:

View File

@ -92,7 +92,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
@ -101,7 +101,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT: BB0_3:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4
@ -328,14 +328,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: v_mov_b32_e32 v0, s12
; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT: BB1_3:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3

View File

@ -233,7 +233,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -243,7 +243,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid

View File

@ -1121,7 +1121,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
@ -1144,9 +1144,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
; GFX10-NEXT: global_store_dword v11, v6, s[0:1] offset:24
; GFX10-NEXT: global_store_dwordx2 v11, v[4:5], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1]
; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24
; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid

View File

@ -90,8 +90,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v3, v2 offset:12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v7, off
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
entry:
@ -340,8 +340,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset1:1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v7, off
; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v5, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0

View File

@ -20,9 +20,7 @@ define { double, double } @testfn(double %arg, double %arg1, double %arg2) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1]
; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1]
; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
; GFX10-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5]
; GFX10-NEXT: s_setpc_b64 s[30:31]

View File

@ -970,11 +970,11 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v15, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v15, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
@ -989,7 +989,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v15, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
double addrspace(1)* %in2) #0 {
%r0 = load double, double addrspace(1)* %in1, align 8
@ -1141,10 +1141,10 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15]
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7]
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
@ -1299,10 +1299,10 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15]
; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7]
; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
@ -1893,49 +1893,49 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[15:16], v11, s[2:3] offset:32
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v16
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v15
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v6, v16, v1
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v16
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX10-NEXT: v_cvt_f16_f32_e32 v10, v5
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
; GFX10-NEXT: v_div_fixup_f16 v5, v10, v3, v4
; GFX10-NEXT: v_trunc_f16_e32 v10, v5
; GFX10-NEXT: v_fmac_f16_e64 v4, -v10, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v3, v6
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v15
; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v5
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v15, v0
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v15
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
@ -1943,7 +1943,7 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2
; GFX10-NEXT: v_and_b32_e32 v2, v3, v6
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x half> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
@ -2161,11 +2161,11 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v11, s[2:3] offset:32
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@ -2178,26 +2178,26 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8
; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v6, v5, v7, v8
; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v2, v0
; GFX10-NEXT: v_div_fixup_f32 v5, v6, v3, v1
; GFX10-NEXT: v_trunc_f32_e32 v6, v5
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1
; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
; GFX10-NEXT: v_fma_f32 v1, v3, -v6, v1
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
; GFX10-NEXT: v_rcp_f32_e32 v6, v5
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6
; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6
; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v4
; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6
; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3
; GFX10-NEXT: v_fma_f32 v7, v8, v6, v7
; GFX10-NEXT: v_fma_f32 v5, -v5, v7, v4
; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v3, v5, v6, v7
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2
; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<2 x float> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
@ -2538,11 +2538,11 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[15:18], v8, s[6:7]
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v18
; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v18, v7, v18
; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3
; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
; GFX10-NEXT: v_rcp_f32_e32 v11, v10
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0
@ -2553,55 +2553,55 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v17, v6, v17
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v18
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
; GFX10-NEXT: v_trunc_f32_e32 v9, v9
; GFX10-NEXT: v_fma_f32 v18, v7, -v9, v18
; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v17
; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3
; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
; GFX10-NEXT: v_rcp_f32_e32 v10, v9
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10
; GFX10-NEXT: v_mul_f32_e32 v11, v0, v10
; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v0
; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10
; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7
; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11
; GFX10-NEXT: v_fma_f32 v1, -v9, v11, v0
; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v7, v1, v10, v11
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v17
; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; GFX10-NEXT: v_trunc_f32_e32 v7, v7
; GFX10-NEXT: v_fma_f32 v17, v6, -v7, v17
; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v16
; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v16, v5, v16
; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2
; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
; GFX10-NEXT: v_rcp_f32_e32 v9, v7
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0
; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9
; GFX10-NEXT: v_mul_f32_e32 v0, v6, v9
; GFX10-NEXT: v_fma_f32 v11, -v7, v0, v6
; GFX10-NEXT: v_fma_f32 v0, v11, v9, v0
; GFX10-NEXT: v_fma_f32 v6, -v7, v0, v6
; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9
; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6
; GFX10-NEXT: v_fma_f32 v10, v11, v9, v10
; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v0
; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v16
; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; GFX10-NEXT: v_trunc_f32_e32 v6, v6
; GFX10-NEXT: v_fma_f32 v16, v5, -v6, v16
; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v15
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v15, v4, v15
; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1
; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
; GFX10-NEXT: v_rcp_f32_e32 v7, v6
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0
; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7
; GFX10-NEXT: v_mul_f32_e32 v0, v5, v7
; GFX10-NEXT: v_fma_f32 v10, -v6, v0, v5
; GFX10-NEXT: v_fma_f32 v0, v10, v7, v0
; GFX10-NEXT: v_fma_f32 v5, -v6, v0, v5
; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7
; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5
; GFX10-NEXT: v_fma_f32 v9, v10, v7, v9
; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v0
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v15
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
; GFX10-NEXT: v_fmac_f32_e64 v15, -v5, v4
; GFX10-NEXT: global_store_dwordx4 v8, v[15:18], s[4:5]
; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x float> addrspace(1)* %in2) #0 {
%gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
@ -2842,34 +2842,34 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GFX10-NEXT: global_load_dwordx4 v[18:21], v16, s[2:3] offset:64
; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[20:21], v[20:21], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[18:19], v[18:19], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[20:21], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[20:21], v[2:3]
; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[20:21], v[2:3]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[18:19], v[0:1]
; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[18:19], v[0:1]
; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[18:19], v[0:1]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
<2 x double> addrspace(1)* %in2) #0 {

View File

@ -843,31 +843,31 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
; GFX10-NEXT: v_and_b32_e32 v9, 15, v6
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX10-NEXT: v_and_b32_e32 v15, 15, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
; GFX10-NEXT: v_and_b32_e32 v19, 15, v6
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v6, v19, v10
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
; GFX10-NEXT: v_and_b32_e32 v2, 15, v5
; GFX10-NEXT: v_or_b32_e32 v11, v6, v4
; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
; GFX10-NEXT: v_lshl_or_b32 v0, v11, 16, v0
; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
@ -1005,28 +1005,28 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
; GFX10-NEXT: v_and_b32_e32 v5, 15, v5
; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
; GFX10-NEXT: v_and_b32_e32 v15, 15, v8
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
; GFX10-NEXT: v_or_b32_e32 v7, v5, v4
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v7, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
@ -1085,9 +1085,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
; GFX10-NEXT: v_and_b32_e32 v7, 63, v5
; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -1172,18 +1172,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_and_b32_e32 v15, 63, v8
; GFX10-NEXT: v_and_b32_e32 v19, 63, v9
; GFX10-NEXT: v_and_b32_e32 v9, 63, v10
; GFX10-NEXT: v_and_b32_e32 v13, 63, v11
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v15, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v9, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[15:16], v13, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
; GFX10-NEXT: v_or_b32_e32 v1, v12, v5
; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
; GFX10-NEXT: v_or_b32_e32 v3, v16, v7
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
ret <2 x i64> %ret
@ -1331,10 +1331,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v7, 8, v4
; GFX10-NEXT: v_add_nc_u32_e32 v6, 8, v5
; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v7
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v6
; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
ret <2 x i24> %ret

View File

@ -466,8 +466,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v2, s0
; GFX10-NEXT: global_store_short v[5:6], v7, off
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v2, s0
; GFX10-NEXT: global_store_short v[5:6], v2, off
; GFX10-NEXT: s_cbranch_vccz BB4_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -546,16 +546,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_trunc_f32_e32 v10, v8
; GFX10-NEXT: v_mad_f32 v7, -v10, v0, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v10
; GFX10-NEXT: v_trunc_f32_e32 v8, v8
; GFX10-NEXT: v_mad_f32 v7, -v8, v0, v7
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_sub_nc_u32_e32 v7, v2, v7
; GFX10-NEXT: global_store_short v[5:6], v7, off
; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7
; GFX10-NEXT: global_store_short v[5:6], v2, off
; GFX10-NEXT: s_cbranch_vccz BB5_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -646,8 +646,8 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v0|
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1
; GFX10-NEXT: v_add_nc_u32_e32 v7, v2, v7
; GFX10-NEXT: global_store_short v[5:6], v7, off
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7
; GFX10-NEXT: global_store_short v[5:6], v2, off
; GFX10-NEXT: s_cbranch_vccz BB6_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -725,14 +725,14 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v7
; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7
; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7
; GFX10-NEXT: v_mul_f32_e32 v8, v11, v1
; GFX10-NEXT: v_mul_f32_e32 v8, v5, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6
; GFX10-NEXT: v_trunc_f32_e32 v10, v8
; GFX10-NEXT: v_trunc_f32_e32 v8, v8
; GFX10-NEXT: v_or_b32_e32 v6, 1, v6
; GFX10-NEXT: v_mad_f32 v5, -v10, v0, v11
; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v10
; GFX10-NEXT: v_mad_f32 v5, -v8, v0, v5
; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8
; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0|
; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
@ -742,8 +742,8 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1
; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v2
; GFX10-NEXT: global_store_short v[5:6], v7, off
; GFX10-NEXT: v_sub_nc_u32_e32 v2, v7, v2
; GFX10-NEXT: global_store_short v[5:6], v2, off
; GFX10-NEXT: s_cbranch_vccz BB7_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm

View File

@ -2732,11 +2732,11 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v7, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm

View File

@ -341,21 +341,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v10, v2, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v6, v6, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_bfe_i32 v4, v8, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v10, v9, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v7, v9, 0, 8
; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
@ -534,7 +534,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@ -543,7 +543,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@ -718,14 +718,14 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v5, v0, v3
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5
; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
@ -908,13 +908,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm

View File

@ -327,17 +327,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1
; GFX10-DL-NEXT: v_and_b32_e32 v10, s0, v2
; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2
; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
@ -517,7 +517,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@ -526,7 +526,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@ -841,7 +841,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@ -850,7 +850,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v11, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@ -1025,17 +1025,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
@ -1215,14 +1215,14 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5
; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
@ -1412,11 +1412,11 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v6, v0, v3, s2
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v6
; GFX10-DL-NEXT: v_add3_u32 v0, v6, v4, v3
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
@ -1622,7 +1622,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v9, v2, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8
; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4
; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@ -1631,7 +1631,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v3, v6, v9, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
@ -1809,13 +1809,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
@ -2230,7 +2230,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v1, v2, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8
; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@ -2239,7 +2239,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]

View File

@ -644,26 +644,26 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
@ -672,13 +672,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
@ -686,13 +686,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@ -722,55 +722,55 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_acc16:
@ -1218,26 +1218,26 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
@ -1246,13 +1246,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
@ -1260,13 +1260,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@ -1296,55 +1296,55 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_acc8:
@ -1713,25 +1713,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v15, v1, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4
; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v14, v0, v7, v5
; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-XNACK-NEXT: v_add3_u32 v15, v14, v3, v4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2
; GFX10-DL-XNACK-NEXT: v_add3_u32 v6, v15, v8, v6
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v1, v2
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v6, v3, v4
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@ -1765,25 +1765,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v15, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4
; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v15, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v15, v8, v6
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v3, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v3, v0, v5
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_multiuses_mul1:
@ -2550,7 +2550,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
; GFX10-DL-XNACK-NEXT: v_bfe_u32 v19, v2, 24, 4
; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2
; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4
@ -2577,7 +2577,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5
; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v19
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
@ -2592,9 +2592,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v7, v1, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v7, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
@ -2638,7 +2638,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v19, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4
@ -2665,7 +2665,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v19
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
@ -2676,11 +2676,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v7, v0, v6
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v7, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
@ -3196,7 +3196,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v19, 0
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
@ -3207,7 +3207,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v19, s[0:1]
; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@ -3250,8 +3250,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
@ -3262,13 +3262,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v23, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v23
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@ -3284,12 +3284,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v23, v0
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
@ -3297,7 +3297,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v19, 0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
@ -3308,7 +3308,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v19, s[0:1]
; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@ -3347,7 +3347,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v23, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
@ -3360,7 +3360,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12
@ -3390,7 +3390,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry

View File

@ -494,31 +494,31 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
@ -812,31 +812,31 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
@ -1134,31 +1134,31 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@ -1441,31 +1441,31 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4
; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3]
@ -2373,49 +2373,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NEXT: v_bfe_u32 v19, v1, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v6
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v19
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6
; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v23, 28, v1
; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9
; GFX10-DL-NEXT: v_add_nc_u16 v14, v3, v9
; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10
; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6
; GFX10-DL-NEXT: v_add_nc_u16 v3, v14, v7
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v23, 16, v4
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2
; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
@ -2762,7 +2762,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v19, 0
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
@ -2773,7 +2773,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_ubyte v3, v19, s[0:1]
; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@ -2794,7 +2794,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v23, v2, 16, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15
; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9
@ -2804,7 +2804,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2
; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v23
; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12
; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7
; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -2820,12 +2820,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v23, v0
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
@ -3115,7 +3115,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 24, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7
@ -3133,12 +3132,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v11, v8
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4

View File

@ -448,22 +448,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
;
; GFX10-LABEL: load_3d_tfe_lwe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -579,22 +579,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
;
; GFX10-LABEL: load_cube_lwe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -837,22 +837,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
;
; GFX10-LABEL: load_2darray_lwe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -968,22 +968,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
;
; GFX10-LABEL: load_2dmsaa_both:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1361,22 +1361,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
;
; GFX10-LABEL: load_mip_2d_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -566,10 +566,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@ -650,14 +650,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -707,9 +707,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -762,8 +762,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -804,10 +804,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@ -854,14 +854,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -911,9 +911,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -966,8 +966,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1162,8 +1162,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1196,8 +1196,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -96,13 +96,13 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10-LABEL: image_sample_2d_f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo
; GFX10-NEXT: s_mov_b32 s14, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v4
; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: v_mov_b32_e32 v3, v5
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v2

View File

@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_1d_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@ -92,7 +92,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
@ -499,7 +499,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_1d_lwe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
@ -512,7 +512,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]

View File

@ -15,12 +15,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -58,9 +58,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -139,12 +139,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -166,9 +166,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36]
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -15,12 +15,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -58,9 +58,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -139,12 +139,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -166,9 +166,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -356,8 +356,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB3_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
; GFX10-32-NEXT: s_wqm_b32 s28, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT: s_wqm_b32 s14, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB3_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
@ -379,7 +379,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB3_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@ -388,7 +388,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB3_3: ; %.continue
; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
@ -492,8 +492,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB4_4
; GFX10-32-NEXT: ; %bb.2: ; %.demote
; GFX10-32-NEXT: s_wqm_b32 s28, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT: s_wqm_b32 s14, s12
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-32-NEXT: BB4_3: ; %.continue
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0
@ -515,7 +515,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_waitcnt vmcnt(0)
; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX10-64-NEXT: s_cbranch_execz BB4_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec
@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT: BB4_3: ; %.continue
; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@ -637,8 +637,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT: s_cbranch_scc0 BB5_2
; GFX10-64-NEXT: ; %bb.1: ; %.entry
; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29]
; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D

View File

@ -157,25 +157,25 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX10-NEXT: v_mul_hi_u32 v5, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v8, v1, v2
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2
; GFX10-NEXT: v_mul_hi_i32 v9, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v11, v1, v3
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v15
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v11
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v11, v2
; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v6, v2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_add3_u32 v1, v5, v15, v8
; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo
; GFX10-NEXT: v_add3_u32 v1, v5, v4, v8
; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v6, v0
@ -461,8 +461,8 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30
; GFX10-NEXT: v_ashrrev_i64 v[6:7], 2, v[4:5]
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX10-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo

View File

@ -539,15 +539,15 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; GFX10-LABEL: v_lshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64

View File

@ -330,12 +330,12 @@ define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrs
; GCN-SCRATCH: ; %bb.0: ; %entry
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2
; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: global_load_short_d16_hi v6, v[0:1], off
; GCN-SCRATCH-NEXT: global_load_short_d16_hi v5, v[0:1], off
; GCN-SCRATCH-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:64
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
@ -373,12 +373,12 @@ define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrs
; GCN-SCRATCH: ; %bb.0: ; %entry
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2
; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: global_load_short_d16 v6, v[0:1], off
; GCN-SCRATCH-NEXT: global_load_short_d16 v5, v[0:1], off
; GCN-SCRATCH-NEXT: global_load_short_d16 v2, v[0:1], off offset:64
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0

View File

@ -1,38 +0,0 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
--- |
define amdgpu_kernel void @do_not_reassign_spill() #0 { ret void }
attributes #0 = { "amdgpu-num-vgpr"="8" }
...
# GCN-LABEL: do_not_reassign_spill{{$}}
# GCN: V_AND_B32_e32 killed $vgpr1, killed $vgpr5,
---
name: do_not_reassign_spill
tracksRegLiveness: true
machineFunctionInfo:
stackPtrOffsetReg: $sgpr32
stack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4 }
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
- { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
- { id: 4, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 5, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 6, class: vgpr_32 }
body: |
bb.0:
%0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
%1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
%2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
%3 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
%4 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
%5 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5)
S_NOP 0, implicit-def dead $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
%6 = V_AND_B32_e32 %1, %5, implicit $exec
S_ENDPGM 0, implicit %6
...

View File

@ -1,69 +0,0 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
# Test that subreg reassignments are correctly handled when whole register also
# conflicts. If this is mishandled stall counts will be incorrect and cause an
# infinite loop.
# GCN-LABEL: vgpr64_mixed_use{{$}}
# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF
# GCN: $vcc = IMPLICIT_DEF
# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF
# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF
# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec
# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec
# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 killed $vgpr4_vgpr5, killed $vgpr0_vgpr1, implicit $exec
---
name: vgpr64_mixed_use
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
- { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
- { id: 3, class: vgpr_32 }
- { id: 4, class: vgpr_32 }
- { id: 5, class: sreg_64_xexec }
- { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
- { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
- { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
- { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
- { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
- { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%13 = IMPLICIT_DEF
%14 = IMPLICIT_DEF
%15 = IMPLICIT_DEF
%16 = IMPLICIT_DEF
%17 = IMPLICIT_DEF
%3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec
%4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec
%5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec
S_ENDPGM 0
...

View File

@ -1,611 +0,0 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: v1_vs_v5{{$}}
# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1,
---
name: v1_vs_v5
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: v0_1_vs_v4{{$}}
# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3,
---
name: v0_1_vs_v4
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %1, %0, 0, 0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: v1_2_vs_v4_5{{$}}
# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5,
---
name: v1_2_vs_v4_5
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: s11_vs_vcc{{$}}
# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0
---
name: s11_vs_vcc
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr11' }
- { id: 1, class: vgpr_32 }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
$vcc_lo = IMPLICIT_DEF
%2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: s0_vs_s16{{$}}
# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0,
---
name: s0_vs_s16
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr0 = IMPLICIT_DEF
%1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: s1_vs_s16{{$}}
# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1,
---
name: s1_vs_s16
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr1 = IMPLICIT_DEF
%1 = S_AND_B32 %0, $sgpr1, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: s12_vs_null{{$}}
# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14,
---
name: s12_vs_null
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr12' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: s13_vs_m0{{$}}
# GCN: S_AND_B32 $m0, killed renamable $sgpr14,
---
name: s13_vs_m0
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr13' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = S_AND_B32 $m0, %0, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: s12_13_vs_s28_s29{{$}}
# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15,
---
name: s12_13_vs_s28_s29
tracksRegLiveness: true
registers:
- { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' }
- { id: 1, class: sreg_64 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr28_sgpr29 = IMPLICIT_DEF
%1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: livein{{$}}
# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0,
---
name: livein
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 2, class: vgpr_32 }
liveins:
- { reg: '$vgpr0', virtual-reg: '' }
- { reg: '$vgpr4', virtual-reg: '' }
body: |
bb.0:
liveins: $vgpr0, $vgpr4
%0 = COPY $vgpr0
%1 = COPY $vgpr4
%2 = V_AND_B32_e32 %1, %0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: liveout{{$}}
# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
---
name: liveout
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
$vgpr0 = COPY %0
$vgpr4 = COPY %1
S_ENDPGM 0
...
# GCN-LABEL: implicit{{$}}
# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0
---
name: implicit
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_128 }
- { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' }
body: |
bb.0:
%1 = IMPLICIT_DEF
V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0
S_ENDPGM 0
...
# GCN-LABEL: occupancy_limit{{$}}
# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
---
name: occupancy_limit
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
- { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
- { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
- { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
- { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
- { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
- { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
- { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%3 = IMPLICIT_DEF
%4 = IMPLICIT_DEF
%5 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%13 = IMPLICIT_DEF
%14 = IMPLICIT_DEF
%15 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: csr{{$}}
# GCN: V_AND_B32_e32 $vgpr37, $vgpr0,
---
name: csr
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
- { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
- { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
- { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
- { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
- { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 12, class: vgpr_32, preferred-register: '$vgpr33' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%3 = IMPLICIT_DEF
%4 = IMPLICIT_DEF
%5 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, implicit $exec
GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, implicit $exec
GLOBAL_STORE_DWORD %3, %12, 0, 0, implicit $exec
S_ENDPGM 0
...
# Do not touch undefs
# GCN-LABEL: s0_vs_s16_undef{{$}}
# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0,
---
name: s0_vs_s16_undef
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc
S_ENDPGM 0
...
# GCN-LABEL: smem_bundle{{$}}
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0
---
name: smem_bundle
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' }
- { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' }
- { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' }
- { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' }
- { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 {
%3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0
%4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0
}
S_ENDPGM 0
...
# GCN-LABEL: vreg_512_subs{{$}}
# don't care about the assignment: this used to trigger an infinite loop
---
name: vreg_512_subs
tracksRegLiveness: true
registers:
- { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' }
- { id: 2, class: vgpr_32, preferred-register: '$vgpr28' }
body: |
bb.0:
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec
DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: vgpr_lo16_sub{{$}}
# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
# GCN: renamable $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16
---
name: vgpr_lo16_sub
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_lo16 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
%3 = COPY %2.lo16
$vgpr1_lo16 = COPY %3
SI_RETURN_TO_EPILOG $vgpr1_lo16
...
# GCN-LABEL: vgpr_lo16{{$}}
# GCN: $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16
---
name: vgpr_lo16
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_lo16, preferred-register: '$vgpr4_lo16' }
body: |
bb.0:
liveins: $vgpr0_lo16
%0 = COPY $vgpr0_lo16
$vgpr1_lo16 = COPY %0
SI_RETURN_TO_EPILOG $vgpr1_lo16
...
# GCN-LABEL: vgpr_hi16_sub{{$}}
# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec
# GCN: renamable $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16
---
name: vgpr_hi16_sub
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
- { id: 3, class: vgpr_hi16 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
%3 = COPY %2.hi16
$vgpr1_hi16 = COPY %3
SI_RETURN_TO_EPILOG $vgpr1_hi16
...
# GCN-LABEL: vgpr_hi16{{$}}
# GCN: $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16
---
name: vgpr_hi16
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_hi16, preferred-register: '$vgpr4_hi16' }
body: |
bb.0:
liveins: $vgpr0_hi16
%0 = COPY $vgpr0_hi16
$vgpr1_hi16 = COPY %0
SI_RETURN_TO_EPILOG $vgpr1_hi16
...
# GCN-LABEL: sgpr_lo16_sub{{$}}
# GCN: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr14, $sgpr0, implicit-def $scc
# GCN: renamable $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16
---
name: sgpr_lo16_sub
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
- { id: 1, class: sgpr_32 }
- { id: 2, class: sgpr_lo16 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr0 = IMPLICIT_DEF
%1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
%2 = COPY %1.lo16
$sgpr1_lo16 = COPY %2
SI_RETURN_TO_EPILOG $sgpr1_lo16
...
# GCN-LABEL: sgpr_lo16{{$}}
# GCN: $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16
---
name: sgpr_lo16
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_lo16, preferred-register: '$sgpr4_lo16' }
body: |
bb.0:
liveins: $sgpr0_lo16
%0 = COPY $sgpr0_lo16
$sgpr1_lo16 = COPY %0
SI_RETURN_TO_EPILOG $sgpr1_lo16
...
# Check that we do not use VGPR3 which we would use otherwise.
# We cannot use it because of interference with VGPR3_LO16.
# GCN-LABEL: v1_vs_v5_src_interence{{$}}
# GCN: V_AND_B32_e32 killed $vgpr7, killed $vgpr1,
---
name: v1_vs_v5_src_interence
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
$vgpr3_lo16 = IMPLICIT_DEF
%2 = V_AND_B32_e32 %1, %0, implicit $exec
S_ENDPGM 0
...
# Test that bank of subreg is considered during scavenging.
# If handled incorrectly an infinite loop occurs.
# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}}
# GCN: S_AND_B32 killed renamable $sgpr13, $sgpr0,
---
name: s0_vs_s15_16_17_sub1
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr0 = IMPLICIT_DEF
%1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc
S_ENDPGM 0
...
# Test that the size of subreg is correctly handled in bank calculation.
# If handled incorrectly an infinite loop occurs.
# GCN-LABEL: vgpr_sub_dependence{{$}}
# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF
# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF
# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF
# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF
# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF
# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec
# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, killed $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec
---
name: vgpr_sub_dependence
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' }
- { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' }
- { id: 3, class: vreg_64 }
- { id: 4, class: vreg_64 }
- { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
- { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' }
- { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' }
- { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
- { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
- { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
- { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
%5 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%13 = IMPLICIT_DEF
%14 = IMPLICIT_DEF
%15 = IMPLICIT_DEF
%3 = V_ADD_F64_e64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
%4 = V_ADD_F64_e64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...
# GCN-LABEL: dbg_value_v1_v5{{$}}
# GCN: renamable $vgpr1 = IMPLICIT_DEF
# GCN: renamable $vgpr5 = IMPLICIT_DEF
---
name: dbg_value_v1_v5
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
DBG_VALUE debug-use %1, debug-use %0
S_ENDPGM 0, implicit %0, implicit %1
...
# GCN-LABEL: kill_v1_v5{{$}}
# GCN: renamable $vgpr1 = IMPLICIT_DEF
# GCN: renamable $vgpr5 = IMPLICIT_DEF
# GCN: KILL killed renamable $vgpr5, killed renamable $vgpr1
---
name: kill_v1_v5
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
- { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
- { id: 2, class: vgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
KILL %1, %0
S_ENDPGM 0
...

View File

@ -458,16 +458,16 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[9:10], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v9, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[7:8], v[9:10]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v6, v[7:8], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX10-NEXT: global_store_byte v6, v0, s[6:7]
; GFX10-NEXT: s_endpgm
%a = load i64, i64 addrspace(1)* %aptr, align 4
@ -575,14 +575,14 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v10, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v9, v0, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v5
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v6
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v4, v[9:10], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX10-NEXT: s_endpgm
%a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4

View File

@ -486,17 +486,17 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_bfrev_b32_e32 v6, -2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11
; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result

View File

@ -544,15 +544,15 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64

View File

@ -1100,17 +1100,17 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_bfrev_b32_e32 v6, -2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11
; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5]
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result

View File

@ -230,12 +230,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_lshr_b32 s1, s7, 24
; GFX10-NEXT: s_lshr_b32 s5, s5, 24
; GFX10-NEXT: v_mov_b32_e32 v15, s3
; GFX10-NEXT: v_mov_b32_e32 v8, s3
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: v_mov_b32_e32 v9, s6
; GFX10-NEXT: s_lshr_b32 s0, s4, 8
; GFX10-NEXT: v_mov_b32_e32 v6, s1
; GFX10-NEXT: v_mov_b32_e32 v11, s4
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: ds_write_b8 v0, v1 offset:12
; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14
@ -243,8 +243,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10
; GFX10-NEXT: ds_write_b8 v0, v3 offset:4
; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6
; GFX10-NEXT: ds_write_b8 v0, v11
; GFX10-NEXT: ds_write_b8_d16_hi v0, v11 offset:2
; GFX10-NEXT: ds_write_b8 v0, v4
; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2
; GFX10-NEXT: ds_write_b8 v0, v5 offset:13
; GFX10-NEXT: ds_write_b8 v0, v6 offset:15
; GFX10-NEXT: ds_write_b8 v0, v7 offset:9
@ -252,7 +252,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: ds_write_b8 v0, v15 offset:11
; GFX10-NEXT: ds_write_b8 v0, v8 offset:11
; GFX10-NEXT: ds_write_b8 v0, v9 offset:5
; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
; GFX10-NEXT: ds_write_b8 v0, v2 offset:1
@ -351,15 +351,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v1, s7
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_mov_b32_e32 v7, s4
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: ds_write_b16 v0, v1 offset:12
; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14
; GFX10-NEXT: ds_write_b16 v0, v2 offset:8
; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10
; GFX10-NEXT: ds_write_b16 v0, v3 offset:4
; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6
; GFX10-NEXT: ds_write_b16 v0, v7
; GFX10-NEXT: ds_write_b16_d16_hi v0, v7 offset:2
; GFX10-NEXT: ds_write_b16 v0, v4
; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2
; GFX10-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
ret void
@ -420,9 +420,9 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v6, s7
; GFX10-NEXT: v_mov_b32_e32 v4, s7
; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX10-NEXT: ds_write2_b32 v0, v3, v6 offset0:2 offset1:3
; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX10-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
ret void

View File

@ -196,11 +196,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: s_lshr_b32 s5, s4, 8
; GFX10-NEXT: v_mov_b32_e32 v3, s4
; GFX10-NEXT: s_lshr_b32 s4, s4, 24
; GFX10-NEXT: v_mov_b32_e32 v11, s0
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: v_mov_b32_e32 v7, s3
; GFX10-NEXT: v_mov_b32_e32 v15, s5
; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: v_mov_b32_e32 v9, s4
; GFX10-NEXT: ds_write_b8 v0, v1 offset:8
; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10
@ -208,11 +208,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
; GFX10-NEXT: ds_write_b8 v0, v3
; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2
; GFX10-NEXT: ds_write_b8 v0, v11 offset:9
; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
; GFX10-NEXT: ds_write_b8 v0, v5 offset:11
; GFX10-NEXT: ds_write_b8 v0, v6 offset:5
; GFX10-NEXT: ds_write_b8 v0, v7 offset:7
; GFX10-NEXT: ds_write_b8 v0, v15 offset:1
; GFX10-NEXT: ds_write_b8 v0, v8 offset:1
; GFX10-NEXT: ds_write_b8 v0, v9 offset:3
; GFX10-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1

View File

@ -65,12 +65,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x double> %val
@ -88,12 +84,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret <2 x double> %val
@ -111,12 +103,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
ret <2 x double> %val

View File

@ -75,10 +75,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-NEXT: v_fmac_f16_e32 v6, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, v6
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <3 x half> %val
@ -128,23 +127,21 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v14, v5
; GFX10-NEXT: v_mov_b32_e32 v15, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v14
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; GFX10-NEXT: v_fmac_f16_e32 v15, v0, v2
; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_fmac_f16_e32 v14, v1, v3
; GFX10-NEXT: v_fmac_f16_e32 v5, v8, v7
; GFX10-NEXT: v_fmac_f16_e32 v4, v11, v10
; GFX10-NEXT: v_and_b32_e32 v1, v0, v15
; GFX10-NEXT: v_and_b32_e32 v2, v0, v14
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2
; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3
; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7
; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: v_and_b32_e32 v2, v0, v5
; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <4 x half> %val

View File

@ -31,12 +31,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v13, v3
; GFX10-NEXT: v_mov_b32_e32 v12, v2
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x double> %val
@ -77,18 +73,10 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v29, v7
; GFX10-NEXT: v_mov_b32_e32 v28, v6
; GFX10-NEXT: v_mov_b32_e32 v31, v5
; GFX10-NEXT: v_mov_b32_e32 v30, v4
; GFX10-NEXT: v_mov_b32_e32 v25, v3
; GFX10-NEXT: v_mov_b32_e32 v24, v2
; GFX10-NEXT: v_mov_b32_e32 v27, v1
; GFX10-NEXT: v_mov_b32_e32 v26, v0
; GFX10-NEXT: v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21]
; GFX10-NEXT: v_fma_f64 v[6:7], v[28:29], v[14:15], v[22:23]
; GFX10-NEXT: v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19]
; GFX10-NEXT: v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17]
; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <4 x double> %val
@ -162,12 +150,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x doubl
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v13, v3
; GFX10-NEXT: v_mov_b32_e32 v12, v2
; GFX10-NEXT: v_mov_b32_e32 v15, v1
; GFX10-NEXT: v_mov_b32_e32 v14, v0
; GFX10-NEXT: v_fma_f64 v[2:3], -v[12:13], -v[6:7], v[10:11]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[14:15], -v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg <2 x double> %x
%neg.y = fneg <2 x double> %y

View File

@ -65,12 +65,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x double> %val
@ -88,12 +84,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret <2 x double> %val
@ -111,12 +103,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, <
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7]
; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5]
; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
ret <2 x double> %val

View File

@ -65,12 +65,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x double> %val
@ -88,12 +84,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret <2 x double> %val
@ -111,12 +103,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, <
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v0
; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7]
; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5]
; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
ret <2 x double> %val

View File

@ -746,11 +746,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_sdwa v1, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -778,15 +778,15 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[9:10], v[2:3], off
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v10, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v9, 16, v2
; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@ -816,12 +816,12 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -1319,14 +1319,14 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[7:8], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_fma_f16 v4, v7, v2, v4 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v2, v8, v2, v5 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v0, v7, v3, v4 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v1, v8, v3, v2 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
entry:
@ -1380,14 +1380,16 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_e32 v1, v3, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v7, 16, v2
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1