diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 7220616c2b79..a38d0a779bd6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -74,16 +74,6 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); -namespace AMDGPU { -enum RegBankReassignMode { - RM_VGPR = 1, - RM_SGPR = 2, - RM_BOTH = RM_VGPR | RM_SGPR -}; -} -MachineFunctionPass * -createGCNRegBankReassignPass(AMDGPU::RegBankReassignMode Mode); - struct AMDGPUSimplifyLibCallsPass : PassInfoMixin { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); @@ -342,9 +332,6 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; -void initializeGCNRegBankReassignPass(PassRegistry &); -extern char &GCNRegBankReassignID; - void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b7fcffb24473..b50e0eb8b87f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -262,7 +262,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); - initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } @@ -1177,10 +1176,8 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { - if (EnableRegReassign) { + if (EnableRegReassign) addPass(&GCNNSAReassignID); - addPass(createGCNRegBankReassignPass(AMDGPU::RM_BOTH)); - } return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 4a4fee56539d..41d58d5b76b5 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -139,7 +139,6 @@ add_llvm_target(AMDGPUCodeGen SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp - GCNRegBankReassign.cpp GCNNSAReassign.cpp GCNDPPCombine.cpp SIModeRegister.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp deleted file mode 100644 index b877ef9be660..000000000000 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ /dev/null @@ -1,900 +0,0 @@ -//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Try to reassign registers on GFX10+ to reduce register bank -/// conflicts. -/// -/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in -/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to -/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, -/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. -/// -/// The shader can read one dword from each of these banks once per cycle. -/// If an instruction has to read more register operands from the same bank -/// an additional cycle is needed. HW attempts to pre-load registers through -/// input operand gathering, but a stall cycle may occur if that fails. For -/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, -/// potentially incuring 2 stall cycles. -/// -/// The pass tries to reassign registers to reduce bank conflicts. -/// -/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so -/// that 4 has to be subtracted from an SGPR bank number to get the real value. -/// This also corresponds to bit numbers in bank masks used in the pass. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; -using namespace AMDGPU; - -static cl::opt VerifyStallCycles("amdgpu-verify-regbanks-reassign", - cl::desc("Verify stall cycles in the regbanks reassign pass"), - cl::value_desc("0|1|2"), - cl::init(0), cl::Hidden); - -// Threshold to keep compile time reasonable. -static cl::opt VRegThresh("amdgpu-regbanks-reassign-threshold", - cl::desc("Max number of vregs to run the regbanks reassign pass"), - cl::init(15000), cl::Hidden); - -#define DEBUG_TYPE "amdgpu-regbanks-reassign" - -#define NUM_VGPR_BANKS 4 -#define NUM_SGPR_BANKS 8 -#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) -#define SGPR_BANK_OFFSET NUM_VGPR_BANKS -#define VGPR_BANK_MASK 0xf -#define SGPR_BANK_MASK 0xff0 -#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) - -STATISTIC(NumStallsDetected, - "Number of operand read stalls detected"); -STATISTIC(NumStallsRecovered, - "Number of operand read stalls recovered"); - -namespace { - -class GCNRegBankReassign : public MachineFunctionPass { - - class OperandMask { - public: - OperandMask(unsigned r, unsigned s, unsigned m) - : Reg(r), SubReg(s), Mask(m) {} - Register Reg; - unsigned SubReg; - unsigned Mask; - }; - - class Candidate { - public: - Candidate(MachineInstr *mi, Register reg, unsigned subreg, - unsigned freebanks) - : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - MI->dump(); - dbgs() << P->printReg(Reg) << " to banks "; - dumpFreeBanks(FreeBanks); - dbgs() << '\n'; - } -#endif - - MachineInstr *MI; - Register Reg; - unsigned SubReg; - unsigned FreeBanks; - }; - - class CandidateList : public std::map> { - public: - void push(unsigned Weight, const Candidate&& C) { - operator[](Weight).push_front(C); - } - - Candidate &back() { - return rbegin()->second.back(); - } - - void pop_back() { - rbegin()->second.pop_back(); - if (rbegin()->second.empty()) - erase(rbegin()->first); - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(const GCNRegBankReassign *P) const { - dbgs() << "\nCandidates:\n\n"; - for (auto &B : *this) { - dbgs() << " Weight " << B.first << ":\n"; - for (auto &C : B.second) - C.dump(P); - } - dbgs() << "\n\n"; - } -#endif - }; - -public: - static char ID; - -public: - GCNRegBankReassign(RegBankReassignMode Mode = RM_BOTH) - : MachineFunctionPass(ID), Mode(Mode) { - initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "GCN RegBank Reassign"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - const GCNSubtarget *ST; - - const MachineRegisterInfo *MRI; - - const SIRegisterInfo *TRI; - - MachineLoopInfo *MLI; - - VirtRegMap *VRM; - - LiveRegMatrix *LRM; - - LiveIntervals *LIS; - - RegBankReassignMode Mode; - - unsigned MaxNumVGPRs; - - unsigned MaxNumSGPRs; - - BitVector RegsUsed; - - SmallVector OperandMasks; - - CandidateList Candidates; - - const MCPhysReg *CSRegs; - - // Returns bank for a phys reg. - unsigned getPhysRegBank(Register Reg, unsigned SubReg) const; - - // Return a bit set for each register bank used. 4 banks for VGPRs and - // 8 banks for SGPRs. - // Registers already processed and recorded in RegsUsed are excluded. - // If Bank is not -1 assume Reg:SubReg to belong to that Bank. - uint32_t getRegBankMask(Register Reg, unsigned SubReg, int Bank); - - // Analyze one instruction returning the number of stalls and a mask of the - // banks used by all operands. - // If Reg and Bank are provided, assume all uses of Reg will be replaced with - // a register chosen from Bank. - std::pair analyzeInst(const MachineInstr &MI, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1); - - // Return true if register is regular VGPR or SGPR or their tuples. - // Returns false for special registers like m0, vcc etc. - bool isReassignable(Register Reg) const; - - // Check if registers' defs are old and may be pre-loaded. - // Returns 0 if both registers are old enough, 1 or 2 if one or both - // registers will not likely be pre-loaded. - unsigned getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const; - - - // Find all bank bits in UsedBanks where Mask can be relocated to. - unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; - - // Find all bank bits in UsedBanks where Mask can be relocated to. - // Bank is relative to the register and not its subregister component. - // Returns 0 is a register is not reassignable. - unsigned getFreeBanks(Register Reg, unsigned SubReg, unsigned Mask, - unsigned UsedBanks) const; - - // Add cadidate instruction to the work list. - void collectCandidates(MachineInstr& MI, unsigned UsedBanks, - unsigned StallCycles); - - // Collect cadidate instructions across function. Returns a number stall - // cycles detected. Only counts stalls if Collect is false. - unsigned collectCandidates(MachineFunction &MF, bool Collect = true); - - // Remove all candidates that read specified register. - void removeCandidates(Register Reg); - - // Compute stalls within the uses of SrcReg replaced by a register from - // Bank. If Bank is -1 does not perform substitution. If Collect is set - // candidates are collected and added to work list. - unsigned computeStallCycles(Register SrcReg, - Register Reg = Register(), - unsigned SubReg = 0, int Bank = -1, - bool Collect = false); - - // Search for a register in Bank unused within LI. - // Returns phys reg or NoRegister. - MCRegister scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const; - - // Try to reassign candidate. Returns number or stall cycles saved. - unsigned tryReassign(Candidate &C); - - bool verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, unsigned CyclesSaved); - - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -public: - Printable printReg(Register Reg, unsigned SubReg = 0) const { - return Printable([Reg, SubReg, this](raw_ostream &OS) { - if (Reg.isPhysical()) { - OS << llvm::printReg(Reg, TRI); - return; - } - if (!VRM->isAssignedReg(Reg)) - OS << " " << llvm::printReg(Reg, TRI); - else - OS << llvm::printReg(Reg, TRI) << '(' - << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; - if (SubReg) - OS << ':' << TRI->getSubRegIndexName(SubReg); - }); - } - - static Printable printBank(unsigned Bank) { - return Printable([Bank](raw_ostream &OS) { - OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); - }); - } - - static void dumpFreeBanks(unsigned FreeBanks) { - for (unsigned L = 0; L < NUM_BANKS; ++L) - if (FreeBanks & (1 << L)) - dbgs() << printBank(L) << ' '; - } -#endif -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", - false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) -INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", - false, false) - - -char GCNRegBankReassign::ID = 0; - -char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; - -unsigned GCNRegBankReassign::getPhysRegBank(Register Reg, - unsigned SubReg) const { - assert(Reg.isPhysical()); - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - if (Size == 16) - Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) { - if (SubReg) { - const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); - Reg = TRI->getSubReg(Reg, SubReg); - if (TRI->getRegSizeInBits(*SubRC) > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } else { - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - } - - if (TRI->hasVGPRs(RC)) { - unsigned RegNo = Reg - AMDGPU::VGPR0; - return RegNo % NUM_VGPR_BANKS; - } - - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; -} - -uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg, - int Bank) { - if (Reg.isVirtual()) { - if (!VRM->isAssignedReg(Reg)) - return 0; - - Reg = VRM->getPhys(Reg); - if (!Reg) - return 0; - if (SubReg) - Reg = TRI->getSubReg(Reg, SubReg); - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - if (Size == 16) { - Reg = TRI->get32BitRegister(Reg); - Size = 1; - } else { - Size /= 32; - if (Size > 1) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); - } - - if (TRI->hasVGPRs(RC)) { - // VGPRs have 4 banks assigned in a round-robin fashion. - unsigned RegNo = Reg - AMDGPU::VGPR0; - uint32_t Mask = maskTrailingOnes(Size); - unsigned Used = 0; - // Bitmask lacks an extract method - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(RegNo + I)) - Used |= 1 << I; - RegsUsed.set(RegNo, RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_VGPR_BANKS : uint32_t(Bank); - return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - } - - // SGPRs have 8 banks holding 2 consequitive registers each. - unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2; - unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); - if (RegNo + StartBit >= RegsUsed.size()) - return 0; - - if (Size > 1) - Size /= 2; - unsigned Mask = (1 << Size) - 1; - unsigned Used = 0; - for (unsigned I = 0; I < Size; ++I) - if (RegsUsed.test(StartBit + RegNo + I)) - Used |= 1 << I; - RegsUsed.set(StartBit + RegNo, StartBit + RegNo + Size); - Mask &= ~Used; - Mask <<= (Bank == -1) ? RegNo % NUM_SGPR_BANKS - : unsigned(Bank - SGPR_BANK_OFFSET); - Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - // Reserve 4 bank ids for VGPRs. - return Mask << SGPR_BANK_OFFSET; -} - -std::pair -GCNRegBankReassign::analyzeInst(const MachineInstr &MI, Register Reg, - unsigned SubReg, int Bank) { - unsigned StallCycles = 0; - unsigned UsedBanks = 0; - - if (MI.isMetaInstruction()) - return std::make_pair(StallCycles, UsedBanks); - - if (!(Mode & RM_SGPR) && - MI.getDesc().TSFlags & (SIInstrFlags::SMRD | SIInstrFlags::SALU)) - return std::make_pair(StallCycles, UsedBanks); - - RegsUsed.reset(); - OperandMasks.clear(); - for (const auto& Op : MI.explicit_uses()) { - // Undef can be assigned to any register, so two vregs can be assigned - // the same phys reg within the same instruction. - if (!Op.isReg() || Op.isUndef()) - continue; - - const Register R = Op.getReg(); - const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); - - // Do not compute stalls for AGPRs - if (TRI->hasAGPRs(RC)) - continue; - if ((Mode != RM_BOTH) && !(Mode & (TRI->hasVGPRs(RC) ? RM_VGPR : RM_SGPR))) - continue; - - // Do not compute stalls if sub-register covers all banks - if (Op.getSubReg()) { - LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (TRI->hasVGPRs(RC)) { - if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) - continue; - } else { - if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) - continue; - } - } - - unsigned ShiftedBank = Bank; - - if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { - unsigned RegOffset = - TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); - unsigned Offset = TRI->getChannelFromSubReg( - Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0); - if (Bank < NUM_VGPR_BANKS) { - unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); - ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; - } else if (Bank >= SGPR_BANK_OFFSET) { - unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); - ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; - } - } - - uint32_t Mask = getRegBankMask(R, Op.getSubReg(), - (Reg == R) ? ShiftedBank : -1); - StallCycles += countPopulation(UsedBanks & Mask); - UsedBanks |= Mask; - OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); - } - - return std::make_pair(StallCycles, UsedBanks); -} - -unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, - Register Reg1, - Register Reg2, - unsigned StallCycles) const -{ - unsigned Defs = 0; - MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); - MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); - for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { - if (MI.isDebugInstr()) - continue; - --Def; - if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) - continue; - if (Def->modifiesRegister(Reg1, TRI)) - Defs |= 1; - if (Def->modifiesRegister(Reg2, TRI)) - Defs |= 2; - } - return countPopulation(Defs); -} - -bool GCNRegBankReassign::isReassignable(Register Reg) const { - if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) - return false; - - // InlineSpiller does not call LRM::assign() after an LI split leaving it - // in an inconsistent state, so we cannot call LRM::unassign(). - // See llvm bug #48911. - // Skip reassign if a register has originated from such split. - // FIXME: Remove the workaround when bug #48911 is fixed. - if (VRM->getPreSplitReg(Reg)) - return false; - - const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); - - Register PhysReg = VRM->getPhys(Reg); - - if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) - return false; - - for (auto U : MRI->use_nodbg_operands(Reg)) { - if (U.isImplicit()) - return false; - const MachineInstr *UseInst = U.getParent(); - if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) - return false; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // TODO: Support 16 bit registers. Those needs to be moved with their - // parent VGPR_32 and potentially a sibling 16 bit sub-register. - if (Size < 32) - return false; - - if (TRI->hasVGPRs(RC)) - return true; - - if (Size == 16) - return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); - - if (Size > 32) - PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); - - return AMDGPU::SGPR_32RegClass.contains(PhysReg); -} - -unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, - unsigned UsedBanks) const { - unsigned Size = countPopulation(Mask); - unsigned FreeBanks = 0; - unsigned Bank = findFirstSet(Mask); - - UsedBanks &= ~Mask; - - // Find free VGPR banks - if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { - for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; - if (!(UsedBanks & NewMask)) - FreeBanks |= 1 << I; - } - return FreeBanks; - } - - // Find free SGPR banks - // SGPR tuples must be aligned, so step is size in banks it - // crosses. - Bank -= SGPR_BANK_OFFSET; - for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { - if (Bank == I) - continue; - unsigned NewMask = ((1 << Size) - 1) << I; - NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; - if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) - FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; - } - - return FreeBanks; -} - -unsigned GCNRegBankReassign::getFreeBanks(Register Reg, - unsigned SubReg, - unsigned Mask, - unsigned UsedBanks) const { - if (!isReassignable(Reg)) - return 0; - - unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); - - unsigned Offset = TRI->getChannelFromSubReg(SubReg); - if (Offset && (Mask & VGPR_BANK_MASK)) { - unsigned Shift = Offset; - if (Shift >= NUM_VGPR_BANKS) - return 0; - unsigned VB = FreeBanks & VGPR_BANK_MASK; - FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & - VGPR_BANK_MASK; - } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { - unsigned Shift = Offset >> 1; - if (Shift >= NUM_SGPR_BANKS) - return 0; - unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; - FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & - SGPR_BANK_SHIFTED_MASK; - FreeBanks <<= SGPR_BANK_OFFSET; - } - - LLVM_DEBUG(if (FreeBanks) { - dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) - << " to banks: "; dumpFreeBanks(FreeBanks); - dbgs() << '\n'; }); - - return FreeBanks; -} - -void GCNRegBankReassign::collectCandidates(MachineInstr& MI, - unsigned UsedBanks, - unsigned StallCycles) { - LLVM_DEBUG(MI.dump()); - - if (!StallCycles) - return; - - LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); - - for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { - for (unsigned J = I + 1; J != E; ++J) { - if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) - continue; - - Register Reg1 = OperandMasks[I].Reg; - Register Reg2 = OperandMasks[J].Reg; - unsigned SubReg1 = OperandMasks[I].SubReg; - unsigned SubReg2 = OperandMasks[J].SubReg; - unsigned Mask1 = OperandMasks[I].Mask; - unsigned Mask2 = OperandMasks[J].Mask; - unsigned Size1 = countPopulation(Mask1); - unsigned Size2 = countPopulation(Mask2); - - LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << - " and " << printReg(Reg2, SubReg2) << '\n'); - - unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); - Weight += MLI->getLoopDepth(MI.getParent()) * 10; - - LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); - - unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); - unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); - if (FreeBanks1) - Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0), - Candidate(&MI, Reg1, SubReg1, FreeBanks1)); - if (FreeBanks2) - Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0), - Candidate(&MI, Reg2, SubReg2, FreeBanks2)); - } - } -} - -unsigned GCNRegBankReassign::computeStallCycles(Register SrcReg, Register Reg, - unsigned SubReg, int Bank, - bool Collect) { - unsigned TotalStallCycles = 0; - SmallSet Visited; - - for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { - if (MI.isBundle()) - continue; - if (!Visited.insert(&MI).second) - continue; - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); - TotalStallCycles += StallCycles; - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - } - - return TotalStallCycles; -} - -MCRegister GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, - unsigned SubReg) const { - const TargetRegisterClass *RC = MRI->getRegClass(LI.reg()); - unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs - : MaxNumSGPRs; - unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 - : AMDGPU::SGPR0); - - for (MCRegister Reg : RC->getRegisters()) { - // Check occupancy limit. - if (TRI->isSubRegisterEq(Reg, MaxReg)) - break; - - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) - continue; - - for (unsigned I = 0; CSRegs[I]; ++I) - if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && - !LRM->isPhysRegUsed(CSRegs[I])) - return MCRegister::from(AMDGPU::NoRegister); - - LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); - - if (!LRM->checkInterference(LI, Reg)) - return Reg; - } - - return MCRegister::from(AMDGPU::NoRegister); -} - -unsigned GCNRegBankReassign::tryReassign(Candidate &C) { - if (!LIS->hasInterval(C.Reg)) - return 0; - - LiveInterval &LI = LIS->getInterval(C.Reg); - LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); - LI.dump()); - - // For each candidate bank walk all instructions in the range of live - // interval and check if replacing the register with one belonging to - // the candidate bank reduces conflicts. - - unsigned OrigStalls = computeStallCycles(C.Reg); - LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); - if (!OrigStalls) - return 0; - - struct BankStall { - BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; - bool operator<(const BankStall &RHS) const { - if (Stalls == RHS.Stalls) - return Bank < RHS.Bank; - return Stalls > RHS.Stalls; - } - unsigned Bank; - unsigned Stalls; - }; - SmallVector BankStalls; - - for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { - if (C.FreeBanks & (1 << Bank)) { - LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); - if (Stalls < OrigStalls) { - LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " - << Stalls << '\n'); - BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); - } - } - } - llvm::sort(BankStalls); - - MCRegister OrigReg = VRM->getPhys(C.Reg); - LRM->unassign(LI); - while (!BankStalls.empty()) { - BankStall BS = BankStalls.pop_back_val(); - MCRegister Reg = scavengeReg(LI, BS.Bank, C.SubReg); - if (Reg == AMDGPU::NoRegister) { - LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) - << '\n'); - continue; - } - LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) - << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") - << " in bank " << printBank(BS.Bank) << '\n'); - - LRM->assign(LI, Reg); - - LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); - - return OrigStalls - BS.Stalls; - } - LRM->assign(LI, OrigReg); - - return 0; -} - -unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, - bool Collect) { - unsigned TotalStallCycles = 0; - - for (MachineBasicBlock &MBB : MF) { - - LLVM_DEBUG(if (Collect) { - if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); - else dbgs() << MBB.getName(); dbgs() << ":\n"; - }); - - for (MachineInstr &MI : MBB.instrs()) { - if (MI.isBundle()) - continue; // we analyze the instructions inside the bundle individually - - unsigned StallCycles; - unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI); - - if (Collect) - collectCandidates(MI, UsedBanks, StallCycles); - - TotalStallCycles += StallCycles; - } - - LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); - } - - return TotalStallCycles; -} - -void GCNRegBankReassign::removeCandidates(Register Reg) { - typename CandidateList::iterator Next; - for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) { - Next = std::next(I); - I->second.remove_if([Reg, this](const Candidate& C) { - return C.MI->readsRegister(Reg, TRI); - }); - if (I->second.empty()) - Candidates.erase(I); - } -} - -bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, - unsigned OriginalCycles, - unsigned CyclesSaved) { - unsigned StallCycles = collectCandidates(MF, false); - LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles - << " stall cycles left\n"); - return StallCycles + CyclesSaved == OriginalCycles; -} - -bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget(); - if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) - return false; - - MRI = &MF.getRegInfo(); - - LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " - << MF.getName() << '\n' - << ((Mode & RM_VGPR) ? "VGPR " : "") - << ((Mode & RM_SGPR) ? "SGPR " : "") << "mode\n" - << "NumVirtRegs = " << MRI->getNumVirtRegs() << "\n\n"); - - if (MRI->getNumVirtRegs() > VRegThresh) { - LLVM_DEBUG(dbgs() << "NumVirtRegs > " << VRegThresh - << " threshold, skipping function.\n\n"); - return false; - } - - TRI = ST->getRegisterInfo(); - MLI = &getAnalysis(); - VRM = &getAnalysis(); - LRM = &getAnalysis(); - LIS = &getAnalysis(); - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned Occupancy = MFI->getOccupancy(); - MaxNumVGPRs = ST->getMaxNumVGPRs(MF); - MaxNumSGPRs = ST->getMaxNumSGPRs(MF); - MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); - MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); - - CSRegs = MRI->getCalleeSavedRegs(); - unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() + - // Not a tight bound - AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1; - RegsUsed.resize(NumRegBanks); - - unsigned StallCycles = collectCandidates(MF); - NumStallsDetected += StallCycles; - - LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " - "function " << MF.getName() << '\n'); - - LLVM_DEBUG(Candidates.dump(this)); - - unsigned CyclesSaved = 0; - while (!Candidates.empty()) { - Candidate C = Candidates.back(); - unsigned LocalCyclesSaved = tryReassign(C); - CyclesSaved += LocalCyclesSaved; - - if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - Candidates.pop_back(); - if (LocalCyclesSaved) { - removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); - - LLVM_DEBUG(Candidates.dump(this)); - } - } - NumStallsRecovered += CyclesSaved; - - LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved - << " cycles saved in function " << MF.getName() << '\n'); - - Candidates.clear(); - - if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) - report_fatal_error("RegBank reassign stall cycles verification failed."); - - RegsUsed.clear(); - - return CyclesSaved > 0; -} - -MachineFunctionPass * -llvm::createGCNRegBankReassignPass(RegBankReassignMode Mode) { - return new GCNRegBankReassign(Mode); -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index c3e31672c852..a178f055ac06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1643,12 +1643,8 @@ define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[10:11] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[7:8] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 80b599f441a2..57410918e0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -314,45 +314,45 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, ; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e32 v18, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v27, v18, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v8, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo ; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v27, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v3, v9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v22, v11, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v3, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v16, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo @@ -577,54 +577,54 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace( ; ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_cndmask_b32_e32 v4, s8, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, s9, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, s8, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s20, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s21, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s20, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s21, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s22, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s23, s0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 525b2c2ec45a..3a88af6fb5dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -581,9 +581,9 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index 50def72b7425..c820562bf9f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -223,9 +223,9 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %i ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_b32_sdwa v4, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v6, v0, s4, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1036,12 +1036,12 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -2613,25 +2613,25 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX10-NEXT: v_and_b32_sdwa v13, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_b32_sdwa v14, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v23, v1, s1, v8 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v17, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v19 +; GFX10-NEXT: v_and_or_b32 v2, v2, s1, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v7 -; GFX10-NEXT: v_or3_b32 v1, v23, v14, v9 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v9 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_and_or_b32 v5, v3, v4, v5 ; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v2, v2, v17, v11 +; GFX10-NEXT: v_or3_b32 v2, v2, v15, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 647d22b68fc9..a944adb4375e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -186,9 +186,9 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 @@ -227,9 +227,9 @@ define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 @@ -346,20 +346,20 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) { ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 ; GFX10-NEXT: s_mov_b64 s[8:9], 3 -; GFX10-NEXT: s_mov_b64 s[14:15], 4 +; GFX10-NEXT: s_mov_b64 s[10:11], 4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, s5, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 ; GFX10-NEXT: s_mov_b64 s[12:13], 5 +; GFX10-NEXT: s_mov_b64 s[14:15], 6 ; GFX10-NEXT: s_mov_b64 s[16:17], 7 ; GFX10-NEXT: s_mov_b64 s[18:19], 8 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: s_mov_b64 s[14:15], 6 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 @@ -561,11 +561,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) { ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s46, s12 +; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 @@ -576,8 +576,8 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo @@ -624,23 +624,23 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo @@ -860,9 +860,9 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 @@ -1360,23 +1360,23 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo @@ -1416,9 +1416,9 @@ define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 @@ -1530,23 +1530,23 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 % ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo @@ -2001,9 +2001,9 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2034,9 +2034,9 @@ define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: ; return to shader part epilog @@ -2162,9 +2162,9 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7 @@ -2199,9 +2199,9 @@ define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 @@ -2311,19 +2311,19 @@ define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, s12 +; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s47, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s11, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -2358,17 +2358,17 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo @@ -2520,11 +2520,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s46, s12 +; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo @@ -2533,8 +2533,8 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s47, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo @@ -2575,23 +2575,23 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v15, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel @@ -3168,8 +3168,8 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s46, s12 -; GFX10-NEXT: s_mov_b32 s47, s13 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 @@ -3187,9 +3187,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s46, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s47, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 @@ -3245,25 +3245,25 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 @@ -3476,25 +3476,25 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) { ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index b399aad2faf0..de48249ae006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -637,9 +637,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 @@ -849,9 +849,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 @@ -1515,9 +1515,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v7, v4 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v7, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 6431eabf459a..aa6a244a1254 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -712,27 +712,27 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_fdiv_v2f32: @@ -752,18 +752,18 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) { ; GFX10-FLUSH-NEXT: s_denorm_mode 0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v11, vcc_lo, v1, v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v11, v6 -; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v11 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2 ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6 -; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v11, -v4, v5 +; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v11, v6, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b @@ -874,27 +874,27 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25: @@ -905,16 +905,16 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv @@ -1044,25 +1044,25 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) { ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 ; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -1226,25 +1226,25 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) { ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v13, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v8, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v8, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v8, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v13, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v13 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 ; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v13, -v2, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v13, v4, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v8, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -1465,27 +1465,27 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v15, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v10, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v10, 1.0 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v10, v9, v10 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v15, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v15 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 ; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v10 -; GFX10-IEEE-NEXT: v_fmac_f32_e64 v15, -v4, v9 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9 ; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v7, v15, v6, v9 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9 ; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v6, v8, v10, v11 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v7, v2, v0 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v6, v3, v1 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: @@ -1496,16 +1496,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 ; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v7, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v6, 1.0, s5, s4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v7, v0 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v6, v1 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll index 946f54d1178b..ca836897baa4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -105,10 +105,10 @@ define double @v_fdiv_f64_afn(double %a, double %b) { ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b ret double %fdiv @@ -355,9 +355,9 @@ define double @v_rcp_f64_arcp_afn(double %x) { ; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: v_mul_f64 v[6:7], 1.0, v[2:3] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[6:7], 1.0 -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX10-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn double 1.0, %x ret double %fdiv @@ -458,10 +458,10 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) { ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[10:11] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b, !fpmath !0 ret double %fdiv @@ -634,33 +634,29 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v30, v4 -; GFX10-NEXT: v_mov_b32_e32 v31, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b ret <2 x double> %fdiv @@ -692,30 +688,22 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v18, v4 -; GFX10-NEXT: v_mov_b32_e32 v19, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] -; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] -; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b ret <2 x double> %fdiv @@ -816,33 +804,29 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v30, v4 -; GFX10-NEXT: v_mov_b32_e32 v31, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv @@ -943,29 +927,29 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x ret <2 x double> %fdiv @@ -1066,29 +1050,29 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> , %x ret <2 x double> %fdiv @@ -1120,26 +1104,22 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[14:15] -; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[14:15], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[6:7], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[2:3], v[4:5], v[4:5] -; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX10-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] ; GFX10-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] ; GFX10-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7] -; GFX10-NEXT: v_fma_f64 v[14:15], -v[14:15], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[0:1], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x double> , %x ret <2 x double> %fdiv @@ -1240,29 +1220,29 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[22:23], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[20:21], s4, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[22:23] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[20:21] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[22:23], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[4:5], -v[20:21], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[4:5], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[6:7] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[22:23], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[20:21], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[4:5], v[6:7], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[12:13], v[10:11], v[18:19] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[2:3], 1.0 +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x, !fpmath !0 ret <2 x double> %fdiv @@ -1294,30 +1274,22 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v18, v4 -; GFX10-NEXT: v_mov_b32_e32 v19, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] -; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] -; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv @@ -1418,33 +1390,29 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v30, v4 -; GFX10-NEXT: v_mov_b32_e32 v31, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_div_scale_f64 v[26:27], s4, v[30:31], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[30:31], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[24:25], s4, v[4:5], v[4:5], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[4:5], v[2:3] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[26:27] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[24:25] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[10:11], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[26:27], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], -v[24:25], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[8:9], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[6:7] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[26:27], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[24:25], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[18:19], v[8:9], v[6:7], v[18:19] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] ; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[14:15], v[22:23] -; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[18:19], v[30:31], v[0:1] -; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[4:5], v[2:3] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv @@ -1476,30 +1444,22 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v18, v4 -; GFX10-NEXT: v_mov_b32_e32 v19, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[18:19], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[6:7], v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[12:13], v[10:11], v[10:11] -; GFX10-NEXT: v_mul_f64 v[12:13], v[22:23], v[8:9] -; GFX10-NEXT: v_mul_f64 v[14:15], v[0:1], v[10:11] -; GFX10-NEXT: v_fma_f64 v[18:19], -v[18:19], v[12:13], v[22:23] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[0:1] -; GFX10-NEXT: v_fma_f64 v[0:1], v[18:19], v[8:9], v[12:13] -; GFX10-NEXT: v_fma_f64 v[2:3], v[4:5], v[10:11], v[14:15] +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX10-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index aad28b9b6cde..077f91302387 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -479,12 +479,8 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v12, v2 -; GFX10-NEXT: v_mov_b32_e32 v13, v3 -; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) ret <2 x double> %fma diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fdd450fea986..83ec29db8f8e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -1159,7 +1159,6 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1167,13 +1166,14 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_mov_b32_e32 v15, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 @@ -2190,13 +2190,13 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: s_sub_i32 s4, 0, 24 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v12 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v12 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2224,19 +2224,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v15 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v11, v6, v12 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v10, v5, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v11, v2 -; GFX10-NEXT: v_and_b32_e32 v6, v7, v12 -; GFX10-NEXT: v_and_b32_e32 v7, v15, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v6, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, v7, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result @@ -2617,13 +2617,13 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX10-NEXT: v_alignbit_b32 v0, v7, v2, v4 -; GFX10-NEXT: v_alignbit_b32 v1, v6, v3, v5 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result @@ -2770,22 +2770,22 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v22, v1, v5, 1 -; GFX10-NEXT: v_alignbit_b32 v18, v0, v4, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 1, v0 +; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 +; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 +; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 -; GFX10-NEXT: v_alignbit_b32 v5, v2, v6, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 -; GFX10-NEXT: v_alignbit_b32 v13, v3, v7, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 -; GFX10-NEXT: v_alignbit_b32 v0, v15, v18, v8 -; GFX10-NEXT: v_alignbit_b32 v1, v19, v22, v9 -; GFX10-NEXT: v_alignbit_b32 v2, v23, v5, v10 -; GFX10-NEXT: v_alignbit_b32 v3, v14, v13, v11 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) ret <4 x i32> %result @@ -4176,15 +4176,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v11, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v15, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v11, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v15, v2 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v19, v1 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4290,9 +4290,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v7, 63, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -4703,18 +4703,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v15, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v19, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v13, 63, v11 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v10 -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result @@ -5178,16 +5178,14 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 ; GFX10-NEXT: s_movk_i32 s4, 0x7f -; GFX10-NEXT: v_mov_b32_e32 v27, v2 ; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 -; GFX10-NEXT: v_mov_b32_e32 v28, v3 ; GFX10-NEXT: v_and_b32_e32 v19, s4, v9 ; GFX10-NEXT: s_sub_i32 s4, 64, 1 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[6:7] ; GFX10-NEXT: s_sub_i32 s4, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s4, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[12:13], s4, v[6:7] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 @@ -5197,48 +5195,48 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v14, 64, v18 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v9, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v14, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[27:28] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v7, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v23, 64, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v7, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[21:22] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[21:22] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 ; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[21:22] -; GFX10-NEXT: v_cndmask_b32_e64 v23, v8, v14, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v31, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v27, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v28, s6 -; GFX10-NEXT: v_or_b32_e32 v0, v31, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -5473,7 +5471,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[8:9] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[8:9] @@ -5481,25 +5479,25 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[8:9] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s2, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s6, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -5756,7 +5754,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5765,12 +5763,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: s_sub_i32 s0, 64, s4 @@ -5778,7 +5776,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, s4, 64 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5787,12 +5785,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 @@ -6025,7 +6023,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[11:12], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s12, 1, s6 ; GFX10-NEXT: s_sub_i32 s13, 1, 64 @@ -6045,10 +6043,10 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s14, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 @@ -6419,7 +6417,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_movk_i32 s4, 0x41 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 31, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 31, v5 ; GFX10-NEXT: s_sub_i32 s5, 64, s4 ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1] @@ -6431,39 +6429,39 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_sub_i32 s5, 64, 63 -; GFX10-NEXT: v_or_b32_e32 v15, v9, v11 ; GFX10-NEXT: v_or_b32_e32 v14, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v15, v9, v11 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[6:7] ; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s7, 1, s4 ; GFX10-NEXT: s_sub_i32 s4, 63, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], s4, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], s4, v[6:7] ; GFX10-NEXT: s_cmp_lt_u32 63, 64 -; GFX10-NEXT: v_or_b32_e32 v6, v19, v8 +; GFX10-NEXT: v_or_b32_e32 v6, v16, v8 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 63, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s7 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v23, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s4 ; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v24, v9, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s4 ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v0, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, v2, s6 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v3, s6 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v27, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v13, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -6810,21 +6808,19 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_movk_i32 s18, 0x7f ; GFX10-NEXT: s_mov_b32 s19, 0 -; GFX10-NEXT: s_mov_b32 s30, s0 ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX10-NEXT: s_sub_i32 s17, s22, 64 ; GFX10-NEXT: s_sub_i32 s23, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 -; GFX10-NEXT: s_mov_b32 s31, s1 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[30:31], s23 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX10-NEXT: s_lshl_b64 s[22:23], s[30:31], s22 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[30:31], s17 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] @@ -6844,7 +6840,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10-NEXT: s_cselect_b64 s[46:47], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[26:27], 0 ; GFX10-NEXT: s_sub_i32 s26, s16, 64 @@ -6853,7 +6849,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s30, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[46:47], s16 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 ; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] @@ -6861,7 +6857,7 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[46:47], s[8:9] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] @@ -7329,8 +7325,6 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v29, v2 -; GFX10-NEXT: v_mov_b32_e32 v30, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v23, 64, v27 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v16, vcc_lo @@ -7338,20 +7332,20 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[29:30] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v21, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v35, v22, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v21, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v10, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v11, s4 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v31, 64, v27 +; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[34:35] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v31, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 @@ -7363,77 +7357,77 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v34, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v35, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, v1, s4 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 -; GFX10-NEXT: v_or_b32_e32 v0, v21, v8 ; GFX10-NEXT: v_or_b32_e32 v1, v11, v9 +; GFX10-NEXT: v_or_b32_e32 v0, v21, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s8, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 -; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v24, s7, v20 ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], s9, v[14:15] -; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_and_b32_e32 v24, s7, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v30, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v16, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v17, v11, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v29, s6 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v18, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[14:15], v24, v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v9, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v31, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[35:36], v18, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v27 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[18:19], v27, v[12:13] -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v31, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v5, v36, v15 -; GFX10-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX10-NEXT: v_or_b32_e32 v5, v11, v15 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v31, v3, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v3, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[3:4], v27, v[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[8:9] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v18, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v27 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v5, v7, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v31, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v12, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v3, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v13, s5 ; GFX10-NEXT: v_or_b32_e32 v3, v22, v23 -; GFX10-NEXT: v_or_b32_e32 v7, v14, v11 +; GFX10-NEXT: v_or_b32_e32 v7, v7, v11 ; GFX10-NEXT: v_or_b32_e32 v4, v15, v5 -; GFX10-NEXT: v_or_b32_e32 v6, v19, v10 +; GFX10-NEXT: v_or_b32_e32 v6, v6, v10 ; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index d9abd3550960..35d17d88615a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -260,9 +260,9 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f ; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 ; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v7, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -1158,38 +1158,38 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_and_b32_e32 v15, 7, v8 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v14, 7, v11 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, 0xff -; GFX10-NEXT: v_lshlrev_b16 v3, v14, v3 ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v13, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v15, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 ; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v15, v5 +; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 @@ -2190,14 +2190,14 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: s_sub_i32 s4, 0, 24 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v12 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v12 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2224,18 +2224,18 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_and_b32_e32 v4, v11, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 +; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX10-NEXT: v_and_b32_e32 v11, v6, v12 -; GFX10-NEXT: v_and_b32_e32 v4, v7, v12 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v7, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, v11, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) @@ -4424,9 +4424,9 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 63, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4833,18 +4833,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v19, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v15, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v11 -; GFX10-NEXT: v_and_b32_e32 v13, 63, v10 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result @@ -5317,46 +5317,44 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s6 ; GFX10-NEXT: v_and_b32_e32 v19, s5, v15 -; GFX10-NEXT: v_and_b32_e32 v20, s5, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v20, s5, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20 -; GFX10-NEXT: v_mov_b32_e32 v25, v4 -; GFX10-NEXT: v_mov_b32_e32 v26, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v19 ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v11, v[9:10] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20 ; GFX10-NEXT: v_lshlrev_b64 v[13:14], v19, v[9:10] -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v20 +; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[9:10] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[9:10] -; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[25:26] +; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[17:18], v17, v[6:7] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19 -; GFX10-NEXT: v_or_b32_e32 v10, v3, v12 -; GFX10-NEXT: v_or_b32_e32 v11, v2, v11 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v21, v[6:7] ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v13, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v11, v2, v11 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v20 +; GFX10-NEXT: v_or_b32_e32 v10, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v13, v15, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v10, v16, v18 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v25, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v26, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v1, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v7, s4 -; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v9, v3 @@ -5591,31 +5589,31 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v12 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[10:11] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[10:11] ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v13, s[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v12, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s8, s2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s8, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -5870,7 +5868,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] ; GFX10-NEXT: s_sub_i32 s0, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 @@ -5879,12 +5877,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] -; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 @@ -6128,10 +6126,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_sub_i32 s5, 1, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshlrev_b64 v[13:14], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v6, v4, v6 @@ -6139,8 +6137,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_and_b32 s5, 1, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v13, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v5, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s5, s6, 64 @@ -6148,7 +6146,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 ; GFX10-NEXT: s_sub_i32 s4, 64, s6 ; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_lshrrev_b64 v[11:12], s4, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[0:1] ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 @@ -6156,8 +6154,8 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[4:5], s5, v[4:5] -; GFX10-NEXT: v_or_b32_e32 v2, v11, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v12, v7 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 ; GFX10-NEXT: s_sub_i32 s10, s8, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo @@ -6538,22 +6536,22 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_cmp_eq_u32 63, 0 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s4, v[0:1] ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[14:15], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 31, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: s_movk_i32 s6, 0x41 ; GFX10-NEXT: s_and_b32 s4, 1, s4 ; GFX10-NEXT: s_sub_i32 s5, 64, s6 ; GFX10-NEXT: v_or_b32_e32 v12, v9, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v14, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v8, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[10:11], s5, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5] ; GFX10-NEXT: s_sub_i32 s5, s6, 64 ; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s5, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v12, v1, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[6:7] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 @@ -6563,17 +6561,17 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { ; GFX10-NEXT: s_and_b32 s5, 1, s5 ; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v7, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX10-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6921,10 +6919,8 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX10-NEXT: s_sub_i32 s31, 64, 1 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: s_mov_b32 s62, s10 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 -; GFX10-NEXT: s_mov_b32 s63, s11 ; GFX10-NEXT: s_cselect_b32 s23, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s31 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], 1 @@ -6935,23 +6931,23 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[46:47], s[2:3], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s23, s16, 64 ; GFX10-NEXT: s_sub_i32 s2, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[46:47], s16 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[0:1], s16 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[26:27], s2 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[26:27], s16 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] ; GFX10-NEXT: s_lshl_b64 s[24:25], s[26:27], s23 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[78:79], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[24:25] ; GFX10-NEXT: s_cmp_lg_u32 s29, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[46:47], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3] ; GFX10-NEXT: s_sub_i32 s26, s22, 64 ; GFX10-NEXT: s_sub_i32 s23, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 @@ -6959,17 +6955,17 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[62:63], s23 -; GFX10-NEXT: s_lshr_b64 s[22:23], s[62:63], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[62:63], s26 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[78:79], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] @@ -7413,7 +7409,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_sub_i32 s5, 64, 1 ; GFX10-NEXT: s_sub_i32 s6, 1, 64 ; GFX10-NEXT: s_cmp_lt_u32 1, 64 -; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[17:18], s5, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 1, 0 @@ -7421,117 +7417,115 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16 -; GFX10-NEXT: v_or_b32_e32 v21, v27, v21 -; GFX10-NEXT: v_or_b32_e32 v18, v28, v22 +; GFX10-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16 ; GFX10-NEXT: s_movk_i32 s7, 0x7f ; GFX10-NEXT: s_and_b32 s8, 1, s8 -; GFX10-NEXT: v_and_b32_e32 v31, s7, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v21, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v24, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v31 ; GFX10-NEXT: v_and_b32_e32 v26, s7, v16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v31 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v19, v[17:18] -; GFX10-NEXT: v_mov_b32_e32 v35, v10 -; GFX10-NEXT: v_mov_b32_e32 v36, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v26 -; GFX10-NEXT: v_lshlrev_b64 v[21:22], v31, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[23:24], v31, v[17:18] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v31 -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v26 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v25 +; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v26 +; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[17:18] +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[17:18] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 -; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[4:5] +; GFX10-NEXT: s_cmp_lt_u32 1, 64 ; GFX10-NEXT: v_or_b32_e32 v21, v2, v21 ; GFX10-NEXT: v_or_b32_e32 v22, v3, v22 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v16, v[17:18] -; GFX10-NEXT: v_lshlrev_b64 v[18:19], v25, v[35:36] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v19, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v39, 0, v24, vcc_lo -; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v21, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v22, vcc_lo -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v29, v[35:36] -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v27, v[10:11] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 ; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] -; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v21, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v22, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v19, v3, v17, s4 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], 1, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5] -; GFX10-NEXT: s_and_b32 s6, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v8, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20 -; GFX10-NEXT: v_or_b32_e32 v2, v27, v10 -; GFX10-NEXT: v_or_b32_e32 v3, v28, v11 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s5, v[4:5] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_and_b32 s6, 1, s5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX10-NEXT: s_and_b32 s8, 1, s8 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[35:36] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v16, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v19, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v4, v2, s6 -; GFX10-NEXT: v_and_b32_e32 v30, s7, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s6 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v3, s6 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v17, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, v0, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v30 ; GFX10-NEXT: v_or_b32_e32 v0, v23, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 -; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v30, v[3:4] +; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v25 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v25 ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX10-NEXT: v_or_b32_e32 v1, v39, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[3:4] +; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v1, v24, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v18, v19 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v30, v[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_or_b32_e32 v10, v5, v10 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 64, v23 -; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v30 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, v[8:9] ; GFX10-NEXT: v_or_b32_e32 v9, v6, v11 -; GFX10-NEXT: v_lshrrev_b64 v[34:35], v5, v[14:15] ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v23 +; GFX10-NEXT: v_lshrrev_b64 v[5:6], v5, v[14:15] ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[7:8], v23, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v34, v16, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v35, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v3, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v4, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v12, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v13, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v7, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s4 -; GFX10-NEXT: v_or_b32_e32 v3, v31, v26 +; GFX10-NEXT: v_or_b32_e32 v3, v22, v26 ; GFX10-NEXT: v_or_b32_e32 v4, v11, v4 ; GFX10-NEXT: v_or_b32_e32 v5, v14, v5 -; GFX10-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v6, v10, v6 ; GFX10-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 9502d23b4f8f..b4b0037ab677 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2235,8 +2235,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v11, v6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 ; GFX10-NEXT: v_and_or_b32 v7, v2, v7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 @@ -2482,8 +2482,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v3, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo @@ -2902,21 +2902,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: s_and_b32 s9, s2, s8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_lshl_b32 s8, s8, s3 ; GFX10-NEXT: s_lshl_b32 s3, s9, s3 ; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v11, v2, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s6 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 ; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0 @@ -3822,19 +3822,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v5, s0 ; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 ; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -4020,16 +4020,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: s_lshl_b32 s7, s8, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 ; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v0, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5 ; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1 @@ -4201,6 +4201,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -4220,9 +4221,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v9, s3 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 ; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 7ac27aff1eb9..adf7a49ae0c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1638,11 +1638,11 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_b32_sdwa v4, v0, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v6, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_lshl_b32 s1, s0, s1 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: v_or3_b32 v0, v6, v4, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v4, v3 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 @@ -1794,9 +1794,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v0, s1, v3 -; GFX10-NEXT: v_or3_b32 v0, v3, v6, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 @@ -1804,10 +1804,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v0, v3, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10-NEXT: v_or3_b32 v2, v0, v2, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v3, v2, v4 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -2324,13 +2324,13 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s1, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v3 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s3, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_lshl_b32 s4, s1, s3 @@ -2629,12 +2629,12 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2905,20 +2905,20 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -3186,20 +3186,20 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_b32_sdwa v6, v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v11, v0, s2, v2 -; GFX10-NEXT: v_and_or_b32 v2, v1, s2, v3 -; GFX10-NEXT: v_or3_b32 v0, v11, v6, v4 -; GFX10-NEXT: v_or3_b32 v1, v2, v7, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -3397,7 +3397,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; ; GFX10-LABEL: insertelement_v_v8i8_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx2 v[11:12], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 ; GFX10-NEXT: s_movk_i32 s1, 0xff @@ -3405,22 +3405,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v12 -; GFX10-NEXT: v_and_b32_sdwa v8, v11, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_and_b32_sdwa v8, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v9, v12, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v9, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v11, v11, s1, v4 -; GFX10-NEXT: v_and_or_b32 v10, v12, s1, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v4 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s1 ; GFX10-NEXT: s_and_b32 s0, s2, s1 -; GFX10-NEXT: v_or3_b32 v0, v11, v8, v6 -; GFX10-NEXT: v_or3_b32 v1, v10, v9, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v9, v7 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 @@ -3906,34 +3906,34 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v15, v0, s1, v6 -; GFX10-NEXT: v_and_or_b32 v14, v1, s1, v7 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s1, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, v4, v5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v15, v10, v8 -; GFX10-NEXT: v_or3_b32 v1, v14, v11, v9 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v8 +; GFX10-NEXT: v_or3_b32 v1, v1, v11, v9 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v7, v4, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_b32_sdwa v4, v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v3, v0, v5, v3 -; GFX10-NEXT: v_and_or_b32 v1, v11, v5, v2 -; GFX10-NEXT: v_or3_b32 v0, v3, v8, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, v5, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v4, v7 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -4820,60 +4820,60 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 ; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v7, v1, v14, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8 ; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc_lo +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 ; GFX10-NEXT: v_or3_b32 v3, v3, v16, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s1 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v13, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v5, v2, s4, v9 ; GFX10-NEXT: v_and_b32_sdwa v14, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_and_or_b32 v18, v3, s4, v4 -; GFX10-NEXT: v_or3_b32 v2, v5, v15, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v0, v0, v13, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v14, v8 -; GFX10-NEXT: v_or3_b32 v3, v18, v16, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v16, v11 +; GFX10-NEXT: v_or3_b32 v2, v2, v15, v10 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -5323,12 +5323,11 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo @@ -5337,7 +5336,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5347,18 +5346,19 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 +; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -5814,16 +5814,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 @@ -5831,23 +5831,23 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX10-NEXT: v_and_or_b32 v19, v15, s5, v4 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v4, v0, s5, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v6, v1, s5, v6 +; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 +; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v2, s5, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v9, v3, s5, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -6300,16 +6300,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v2 @@ -6317,23 +6317,23 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX10-NEXT: v_and_or_b32 v19, v15, s8, v4 -; GFX10-NEXT: v_and_b32_sdwa v0, v15, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v4, v0, s8, v4 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v6, v1, s8, v6 +; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8 +; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v2, s8, v8 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v9, v3, s8, v9 -; GFX10-NEXT: v_and_b32_sdwa v14, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v3, v3, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_or3_b32 v0, v19, v0, v5 +; GFX10-NEXT: v_or3_b32 v0, v4, v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v1, v6, v1, v7 -; GFX10-NEXT: v_or3_b32 v2, v8, v15, v10 -; GFX10-NEXT: v_or3_b32 v3, v9, v14, v11 +; GFX10-NEXT: v_or3_b32 v2, v8, v2, v10 +; GFX10-NEXT: v_or3_b32 v3, v9, v3, v11 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -6659,7 +6659,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v22, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 @@ -6669,76 +6669,76 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v26, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v19, v4, s3, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v15, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v26, v3, s3, v26 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v17, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GFX10-NEXT: v_or3_b32 v3, v3, v15, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v12 -; GFX10-NEXT: v_and_or_b32 v30, v5, s3, v11 -; GFX10-NEXT: v_or3_b32 v3, v26, v15, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v26, v19, v16, v10 +; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v11 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 ; GFX10-NEXT: v_and_b32_sdwa v18, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v5, v30, v17, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v14 -; GFX10-NEXT: v_and_or_b32 v11, v6, s3, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v26, vcc_lo +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v7 +; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 ; GFX10-NEXT: v_lshlrev_b32_e64 v9, v0, s3 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_or3_b32 v6, v11, v18, v8 +; GFX10-NEXT: v_or3_b32 v6, v6, v18, v8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1 -; GFX10-NEXT: v_and_or_b32 v0, v7, v10, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v26, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 +; GFX10-NEXT: v_and_or_b32 v0, v7, v8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v0, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v0, s1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v18 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v2, s3, v5 -; GFX10-NEXT: v_and_b32_sdwa v14, v18, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v5 ; GFX10-NEXT: v_and_b32_sdwa v16, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_and_or_b32 v3, v18, s3, v7 -; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 -; GFX10-NEXT: v_and_b32_sdwa v13, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_or3_b32 v0, v2, v13, v6 ; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 ; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -7063,7 +7063,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v18, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_lshr_b32 s4, s2, 2 @@ -7079,69 +7079,69 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v6 ; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v22, v4, s3, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, v4, s3, v8 +; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v23, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v16, v5, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v14, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_or3_b32 v3, v22, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or3_b32 v3, v3, v15, v9 ; GFX10-NEXT: v_and_b32_sdwa v17, v6, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v13 -; GFX10-NEXT: v_or3_b32 v4, v5, v23, v4 +; GFX10-NEXT: v_or3_b32 v4, v5, v16, v4 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 -; GFX10-NEXT: v_or3_b32 v7, v6, v17, v7 +; GFX10-NEXT: v_or3_b32 v6, v6, v17, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1 ; GFX10-NEXT: v_and_or_b32 v2, v5, s2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v1, v2, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v2, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v19 -; GFX10-NEXT: v_and_b32_sdwa v13, v19, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v13, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v14, v3, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v15, v4, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v19, s3, v5 -; GFX10-NEXT: v_and_b32_sdwa v14, v22, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 ; GFX10-NEXT: v_and_b32_sdwa v16, v2, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v3, v22, s3, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 -; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 +; GFX10-NEXT: v_and_or_b32 v5, v2, s3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v7 +; GFX10-NEXT: v_or3_b32 v0, v1, v13, v6 ; GFX10-NEXT: v_or3_b32 v2, v4, v15, v10 +; GFX10-NEXT: v_or3_b32 v1, v3, v14, v8 ; GFX10-NEXT: v_or3_b32 v3, v5, v16, v11 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v13, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -7489,66 +7489,66 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_and_b32_sdwa v18, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v6 ; GFX10-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7 ; GFX10-NEXT: v_and_or_b32 v5, v5, s1, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_sdwa v19, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 24, v7 +; GFX10-NEXT: v_or3_b32 v4, v4, v17, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v14 ; GFX10-NEXT: v_and_or_b32 v6, v6, s1, v13 -; GFX10-NEXT: v_or3_b32 v15, v4, v17, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_or3_b32 v5, v5, v18, v12 ; GFX10-NEXT: v_and_b32_sdwa v20, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v16 -; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v14 ; GFX10-NEXT: v_or3_b32 v6, v6, v19, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v5, vcc_lo +; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, v0, v1 -; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 +; GFX10-NEXT: v_or3_b32 v7, v7, v20, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 ; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v7, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v15, v0, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v5, v0, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v19, 8, v27 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v18 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v27 -; GFX10-NEXT: v_lshlrev_b32_sdwa v23, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_sdwa v21, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v15, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v19, v2, v1, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v15, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v16, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v10, v4, v1, v10 -; GFX10-NEXT: v_and_b32_sdwa v17, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v5 +; GFX10-NEXT: v_and_b32_sdwa v17, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v5, v0, v1, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v13 -; GFX10-NEXT: v_and_or_b32 v3, v27, v1, v8 -; GFX10-NEXT: v_and_or_b32 v2, v18, v1, v7 +; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v7 +; GFX10-NEXT: v_and_or_b32 v4, v4, v1, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v14, v6 +; GFX10-NEXT: v_or3_b32 v1, v3, v15, v9 +; GFX10-NEXT: v_or3_b32 v2, v4, v16, v11 +; GFX10-NEXT: v_or3_b32 v3, v5, v17, v12 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v19, v21, v6 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v3, v3, v17, v12 -; GFX10-NEXT: v_or3_b32 v1, v2, v15, v9 -; GFX10-NEXT: v_or3_b32 v2, v10, v16, v11 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index df7299573590..b5ccf4708ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -989,8 +989,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: v_mov_b32_e32 v16, s15 ; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_mov_b32_e32 v15, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s12 @@ -1005,30 +1005,28 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v4, s3 ; MOVREL-NEXT: v_mov_b32_e32 v3, s2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; MOVREL-NEXT: s_mov_b32 s30, s18 -; MOVREL-NEXT: s_mov_b32 s31, s19 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4 -; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s19, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s18, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s19, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s18, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s19, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s18, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s19, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s18, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s19, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s18, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s19, s4 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off @@ -1525,19 +1523,17 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 -; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; MOVREL-NEXT: v_mov_b32_e32 v23, v1 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18 -; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2 @@ -2161,8 +2157,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 -; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 @@ -2171,9 +2165,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18 -; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2 @@ -3550,28 +3544,28 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_s(<7 x float> inreg %v ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v16, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s0 +; MOVREL-NEXT: v_mov_b32_e32 v14, s7 +; MOVREL-NEXT: v_mov_b32_e32 v7, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0 -; MOVREL-NEXT: v_mov_b32_e32 v10, s1 -; MOVREL-NEXT: v_mov_b32_e32 v11, s2 -; MOVREL-NEXT: v_mov_b32_e32 v12, s3 -; MOVREL-NEXT: v_mov_b32_e32 v13, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo +; MOVREL-NEXT: v_mov_b32_e32 v8, s1 +; MOVREL-NEXT: v_mov_b32_e32 v9, s2 +; MOVREL-NEXT: v_mov_b32_e32 v10, s3 +; MOVREL-NEXT: v_mov_b32_e32 v11, s4 +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1 -; MOVREL-NEXT: v_mov_b32_e32 v14, s5 -; MOVREL-NEXT: v_mov_b32_e32 v15, s6 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v10, v0, vcc_lo +; MOVREL-NEXT: v_mov_b32_e32 v12, s5 +; MOVREL-NEXT: v_mov_b32_e32 v13, s6 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3 -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 4 -; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 5 -; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 6 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc_lo ; MOVREL-NEXT: v_mov_b32_e32 v0, v7 ; MOVREL-NEXT: ; return to shader part epilog entry: @@ -3624,29 +3618,29 @@ define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_s_v_v(<7 x float> inreg %v ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v16, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s0 +; MOVREL-NEXT: v_mov_b32_e32 v15, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; MOVREL-NEXT: v_mov_b32_e32 v10, s1 -; MOVREL-NEXT: v_mov_b32_e32 v11, s2 -; MOVREL-NEXT: v_mov_b32_e32 v12, s3 -; MOVREL-NEXT: v_mov_b32_e32 v13, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v8, v9, v0, vcc_lo +; MOVREL-NEXT: v_mov_b32_e32 v9, s1 +; MOVREL-NEXT: v_mov_b32_e32 v10, s2 +; MOVREL-NEXT: v_mov_b32_e32 v11, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s4 +; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; MOVREL-NEXT: v_mov_b32_e32 v14, s5 -; MOVREL-NEXT: v_mov_b32_e32 v15, s6 -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v0, vcc_lo +; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 ; MOVREL-NEXT: v_mov_b32_e32 v1, v7 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo ; MOVREL-NEXT: v_mov_b32_e32 v0, v8 ; MOVREL-NEXT: ; return to shader part epilog entry: @@ -4128,23 +4122,21 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v19, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 -; MOVREL-NEXT: v_mov_b32_e32 v18, v3 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 ; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 -; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0 -; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5 @@ -4271,38 +4263,38 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: v_mov_b32_e32 v20, s15 -; MOVREL-NEXT: v_mov_b32_e32 v19, s14 -; MOVREL-NEXT: v_mov_b32_e32 v18, s13 -; MOVREL-NEXT: v_mov_b32_e32 v17, s12 -; MOVREL-NEXT: v_mov_b32_e32 v16, s11 -; MOVREL-NEXT: v_mov_b32_e32 v15, s10 -; MOVREL-NEXT: v_mov_b32_e32 v14, s9 -; MOVREL-NEXT: v_mov_b32_e32 v13, s8 -; MOVREL-NEXT: v_mov_b32_e32 v12, s7 -; MOVREL-NEXT: v_mov_b32_e32 v11, s6 -; MOVREL-NEXT: v_mov_b32_e32 v10, s5 -; MOVREL-NEXT: v_mov_b32_e32 v9, s4 -; MOVREL-NEXT: v_mov_b32_e32 v8, s3 -; MOVREL-NEXT: v_mov_b32_e32 v7, s2 -; MOVREL-NEXT: v_mov_b32_e32 v6, s1 -; MOVREL-NEXT: v_mov_b32_e32 v5, s0 +; MOVREL-NEXT: v_mov_b32_e32 v17, s15 +; MOVREL-NEXT: v_mov_b32_e32 v16, s14 +; MOVREL-NEXT: v_mov_b32_e32 v15, s13 +; MOVREL-NEXT: v_mov_b32_e32 v14, s12 +; MOVREL-NEXT: v_mov_b32_e32 v13, s11 +; MOVREL-NEXT: v_mov_b32_e32 v12, s10 +; MOVREL-NEXT: v_mov_b32_e32 v11, s9 +; MOVREL-NEXT: v_mov_b32_e32 v10, s8 +; MOVREL-NEXT: v_mov_b32_e32 v9, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, s6 +; MOVREL-NEXT: v_mov_b32_e32 v7, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, s4 +; MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v0, v10, v0, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v11, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v3 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 @@ -4466,15 +4458,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; MOVREL-NEXT: v_mov_b32_e32 v15, v2 -; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -4531,15 +4521,13 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; MOVREL-NEXT: v_mov_b32_e32 v15, v2 -; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll index f3bc0466b5f5..062c0ad91ea9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1828,10 +1828,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v8, s3 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll index b438719a47ae..9c01dda2b83c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -29,7 +29,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -45,7 +45,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -83,7 +83,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -102,7 +102,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -140,7 +140,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -159,7 +159,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -195,7 +195,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -211,7 +211,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -249,7 +249,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10NSA-LABEL: gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff @@ -268,7 +268,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -306,7 +306,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff @@ -325,7 +325,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -361,7 +361,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -377,7 +377,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -413,7 +413,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -429,7 +429,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -467,7 +467,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff @@ -486,7 +486,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff @@ -543,7 +543,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll index 5e82ab8c6ab1..f597fa920032 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -80,7 +80,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10NSA-LABEL: gather4_2d_tfe: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0 @@ -101,7 +101,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0 ; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0 ; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0 -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll index 5226382f691a..d19db8b76a68 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -65,16 +65,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: v_mov_b32_e32 v12, v11 -; GFX10-NEXT: v_mov_b32_e32 v13, v11 -; GFX10-NEXT: v_mov_b32_e32 v14, v11 -; GFX10-NEXT: v_mov_b32_e32 v15, v11 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -82,13 +82,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 -; GFX10-NEXT: v_mov_b32_e32 v4, v15 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v11, v4, s[10:11] +; GFX10-NEXT: global_store_dword v8, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -129,16 +129,16 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: v_mov_b32_e32 v12, v11 -; GFX10-NEXT: v_mov_b32_e32 v13, v11 -; GFX10-NEXT: v_mov_b32_e32 v14, v11 -; GFX10-NEXT: v_mov_b32_e32 v15, v11 -; GFX10-NEXT: v_mov_b32_e32 v0, v11 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -146,13 +146,13 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 -; GFX10-NEXT: v_mov_b32_e32 v4, v15 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v11, v4, s[10:11] +; GFX10-NEXT: global_store_dword v8, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll index d4d526b26e86..1f1b34bcd736 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -22,9 +22,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 -; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -35,14 +35,14 @@ main_body: define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v11, s12 -; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4 -; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12 +; GFX10-NEXT: v_and_or_b32 v0, v0, v9, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v9, s12 +; GFX10-NEXT: v_and_or_b32 v2, v3, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, v5, v9, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -72,9 +72,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -85,10 +85,10 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -102,10 +102,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -116,10 +116,10 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -133,9 +133,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 +; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -165,9 +165,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -196,9 +196,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -209,10 +209,10 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 +; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -226,10 +226,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -240,10 +240,10 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 +; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 +; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -257,9 +257,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 +; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v4 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -273,9 +273,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 +; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -289,9 +289,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 +; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v5 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll index 72a9dbbcb232..866bae4b3400 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -79,9 +79,9 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) { ; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 -; GFX10-NEXT: v_dot4_i32_i8 v0, v7, v1, v8 +; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %a.cast = bitcast <4 x i8> %a to i32 %b.cast = bitcast <4 x i8> %b to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll index 70e4021ff4ad..ffcc4ed7d38f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -79,9 +79,9 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) { ; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_or3_b32 v7, v0, v1, v2 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 -; GFX10-NEXT: v_dot4_u32_u8 v0, v7, v1, v8 +; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %a.cast = bitcast <4 x i8> %a to i32 %b.cast = bitcast <4 x i8> %b to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index 1b8689d10a1e..23cc4fb459d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -351,8 +351,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -374,7 +374,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB3_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -487,8 +487,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -510,7 +510,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -519,7 +519,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB4_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -632,8 +632,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry -; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll index 38634ea10e5c..939b491ff08c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -192,7 +192,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v25, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:12 ; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 ; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 @@ -213,7 +213,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(10) ; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) @@ -221,7 +221,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 -; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21 +; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -230,7 +230,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10 +; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll index 6dda1f4b2816..eeef6bcade9a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -158,11 +158,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 ; GFX10-NEXT: ds_read_u8 v10, v0 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v12, 0xff ; GFX10-NEXT: v_mov_b32_e32 v13, 8 ; GFX10-NEXT: s_movk_i32 s4, 0xff @@ -182,19 +182,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v8, v15, v12 +; GFX10-NEXT: v_and_b32_e32 v8, v8, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(3) ; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 ; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 @@ -266,9 +265,9 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 ; GFX10-NEXT: ds_read_u16 v2, v0 offset:6 ; GFX10-NEXT: ds_read_u16 v3, v0 offset:10 -; GFX10-NEXT: ds_read_u16 v7, v0 -; GFX10-NEXT: ds_read_u16 v11, v0 offset:4 -; GFX10-NEXT: ds_read_u16 v15, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v4, v0 +; GFX10-NEXT: ds_read_u16 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:8 ; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(5) ; GFX10-NEXT: v_and_b32_e32 v0, s4, v1 @@ -280,11 +279,11 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_and_or_b32 v0, v7, s4, v0 +; GFX10-NEXT: v_and_or_b32 v0, v4, s4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_or_b32 v1, v11, s4, v1 +; GFX10-NEXT: v_and_or_b32 v1, v5, s4, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v15, s4, v2 +; GFX10-NEXT: v_and_or_b32 v2, v6, s4, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index 73e1da080f19..0b8efd5e154d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -108,7 +108,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v25, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:12 ; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 ; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 @@ -129,7 +129,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(10) ; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_sdwa v21, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) @@ -137,7 +137,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 -; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v21 +; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) @@ -146,7 +146,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v10, v25, v11, v10 +; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -242,11 +242,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 ; GFX10-NEXT: ds_read_u8 v10, v0 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v12, 0xff ; GFX10-NEXT: v_mov_b32_e32 v13, 8 ; GFX10-NEXT: s_movk_i32 s4, 0xff @@ -266,19 +266,18 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { ; GFX10-NEXT: s_waitcnt lgkmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v8, v15, v12 +; GFX10-NEXT: v_and_b32_e32 v8, v8, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(3) ; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 ; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v14, v12, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 @@ -410,27 +409,27 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: ds_write_b8 v0, v1 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:1 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:6 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v3 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:7 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 494593ea3554..b390c736a22c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1608,12 +1608,8 @@ define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[10:11] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[7:8] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i64> %value, %amount ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index fbf6d90e624b..dddad69df467 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -585,12 +585,12 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3 ; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7 ; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9 ; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 ; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v11, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6 ; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den @@ -997,24 +997,24 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v18, s4, v13, v11 +; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 ; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5 -; GFX10-NEXT: v_add_co_u32 v11, s4, v18, v15 +; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15 ; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4 ; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13 ; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v3, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX10-NEXT: v_add3_u32 v10, v10, v13, v7 +; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7 ; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5 -; GFX10-NEXT: v_add3_u32 v1, v10, v15, v1 +; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1 ; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2758,13 +2758,15 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17 ; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 -; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15 +; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9 +; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9 ; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20 ; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18 ; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10 ; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18 @@ -2781,7 +2783,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22 -; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5 @@ -2791,11 +2793,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8 ; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23 -; GFX10-NEXT: v_add3_u32 v18, v19, v29, v18 +; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9 ; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27 -; GFX10-NEXT: v_add3_u32 v30, v21, v24, v23 +; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23 ; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10 ; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26 ; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11 @@ -2813,7 +2814,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9 ; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v19, v30, v23, v20 +; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26 ; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 @@ -2822,120 +2823,119 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8 ; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26 ; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20 ; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27 -; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v24 -; GFX10-NEXT: v_add3_u32 v35, v23, v30, v21 +; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24 +; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21 ; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26 +; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26 ; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19 -; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23 +; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23 ; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26 +; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26 ; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v20, v35, v25, v20 -; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23 -; GFX10-NEXT: v_add3_u32 v23, v34, v27, v28 +; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20 +; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23 +; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10 +; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8 ; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9 -; GFX10-NEXT: v_add_co_u32 v27, s4, v31, v26 -; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11 +; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26 +; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10 ; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v21, s4, v27, v29 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 +; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11 +; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29 ; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22 ; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28 -; GFX10-NEXT: v_add_co_u32 v31, s5, v21, v26 +; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26 ; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24 -; GFX10-NEXT: v_add_co_u32 v21, s5, v31, v29 -; GFX10-NEXT: v_add3_u32 v39, v23, v33, v26 +; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29 +; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26 ; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27 ; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9 -; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v34, s4, v24, v23 +; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20 +; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23 ; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v35, v28, v35, v29 -; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20 -; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v27 +; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 ; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX10-NEXT: v_add_co_u32 v34, s4, v23, v24 -; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v22, v35, v30, v32 -; GFX10-NEXT: v_add3_u32 v21, v39, v26, v21 -; GFX10-NEXT: v_add_co_u32 v34, s4, v34, v27 +; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21 ; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24 +; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29 +; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 +; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 +; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32 ; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v31 +; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31 +; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v22, v22, v28, v27 ; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25 ; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8 -; GFX10-NEXT: v_add_co_u32 v30, s4, v23, v26 -; GFX10-NEXT: v_add3_u32 v33, v22, v24, v25 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26 +; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 ; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28 ; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v23, s4, v30, v27 ; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 ; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 ; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14 ; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13 -; GFX10-NEXT: v_add3_u32 v33, v33, v26, v27 ; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12 -; GFX10-NEXT: v_add3_u32 v26, v7, v29, v28 ; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13 -; GFX10-NEXT: v_add3_u32 v7, v26, v24, v15 -; GFX10-NEXT: v_add_co_u32 v11, s4, v23, v25 +; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28 +; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27 +; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15 +; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4 ; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v21 +; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 ; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8 -; GFX10-NEXT: v_add3_u32 v5, v33, v10, v7 -; GFX10-NEXT: v_add3_u32 v3, v3, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v16 +; GFX10-NEXT: v_add3_u32 v5, v22, v10, v7 +; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v17 -; GFX10-NEXT: v_add3_u32 v7, v3, v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v3, v18 +; GFX10-NEXT: v_add3_u32 v7, v1, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v16 ; GFX10-NEXT: v_mov_b32_e32 v4, v19 ; GFX10-NEXT: v_mov_b32_e32 v5, v20 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll index 1e0d7e88bc27..16c48719bf1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -413,12 +413,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f16_e32 v3, v1 ; GFX10-NEXT: v_rndne_f16_e32 v2, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_rndne_f16_e32 v3, v1 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v7 +; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 ; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 50fa5c749844..12b3b5409b62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4199,16 +4199,16 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4543,30 +4543,26 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v17, v2 -; GFX10-NEXT: v_mov_b32_e32 v18, v3 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v19, vcc_lo, v17, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 ; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -5327,7 +5323,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s1, 64, s0 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo @@ -5335,33 +5330,34 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX10-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; GFX10-NEXT: v_or_b32_e32 v8, v0, v8 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v9 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 @@ -5569,64 +5565,60 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10-LABEL: saddsat_i128_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_add_co_u32 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s2, 64, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7] ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5959,28 +5951,20 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v0, v8 ; GFX10-NEXT: s_movk_i32 s5, 0x7f -; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v22, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s6, 64, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: s_sub_i32 s7, s5, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] @@ -5991,7 +5975,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] @@ -5999,33 +5982,34 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_add_co_u32 v8, s4, v26, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v24, v14, s4 +; GFX10-NEXT: v_add_co_u32 v8, s4, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v5, v13, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v6, v14, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] +; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v7, v15, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] @@ -6035,7 +6019,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 ; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] @@ -6049,13 +6033,13 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_and_b32 s6, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 ; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 @@ -6592,23 +6576,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_addc_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_addc_u32 s30, s2, s10 +; GFX10-NEXT: s_addc_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_addc_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_addc_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -6628,13 +6610,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_ashr_i32 s10, s19, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] @@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6669,7 +6651,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_addc_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 174df2d5a832..4dcbd7c9e092 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1571,12 +1571,8 @@ define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[10:11] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[7:8] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, %amount ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index e4858b872ee6..4e99dacabf41 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4185,16 +4185,16 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4529,30 +4529,26 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v17, v2 -; GFX10-NEXT: v_mov_b32_e32 v18, v3 +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v14, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v19, vcc_lo, v17, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 ; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18] +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -5313,7 +5309,6 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s1, 64, s0 -; GFX10-NEXT: v_lshrrev_b64 v[15:16], s0, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo @@ -5321,33 +5316,34 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s1, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX10-NEXT: v_or_b32_e32 v9, v16, v9 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; GFX10-NEXT: v_or_b32_e32 v8, v0, v8 +; GFX10-NEXT: v_or_b32_e32 v9, v1, v9 +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i64 v[2:3], s1, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 @@ -5555,64 +5551,60 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; ; GFX10-LABEL: ssubsat_i128_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 -; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s0 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 ; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v19, vcc_lo, s2, v9, vcc_lo -; GFX10-NEXT: s_and_b32 s1, 1, s4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v20, vcc_lo, s3, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[15:16], v[5:6] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v20 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[19:20], v[9:10] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[19:20], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: s_movk_i32 s0, 0x7f ; GFX10-NEXT: s_sub_i32 s2, 64, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[4:5] ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[15:16] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s2, v[19:20] ; GFX10-NEXT: s_sub_i32 s1, s0, 64 ; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[8:9], s1, v[6:7] ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[19:20] +; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[6:7] ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 ; GFX10-NEXT: v_xor_b32_e32 v9, v11, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s0 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v20, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5945,28 +5937,20 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 +; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 ; GFX10-NEXT: s_movk_i32 s5, 0x7f -; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v22, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_sub_i32 s6, 64, s5 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: s_sub_i32 s7, s5, 64 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[22:23] -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] -; GFX10-NEXT: v_mov_b32_e32 v25, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], s6, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e32 v20, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] ; GFX10-NEXT: v_lshrrev_b64 v[0:1], s5, v[16:17] @@ -5977,7 +5961,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i64 v[0:1], s5, v[18:19] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: v_ashrrev_i64 v[8:9], s7, v[18:19] @@ -5985,33 +5968,34 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s8, 1, vcc_lo ; GFX10-NEXT: s_and_b32 s4, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v19 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 ; GFX10-NEXT: v_xor_b32_e32 v9, v10, v20 ; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, v1, s4 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v9 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s4, v26, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4 +; GFX10-NEXT: v_sub_co_u32 v8, s4, v4, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v5, v13, s4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v6, v14, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v25, v15, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[26:27] +; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v7, v15, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v20, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], s5, v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], s6, v[10:11] @@ -6021,7 +6005,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_or_b32_e32 v13, v4, v13 ; GFX10-NEXT: v_ashrrev_i64 v[3:4], s5, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[24:25] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v5, s4 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_ashrrev_i64 v[5:6], s7, v[10:11] @@ -6035,13 +6019,13 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: s_and_b32 s6, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 ; GFX10-NEXT: v_xor_b32_e32 v7, v14, v7 -; GFX10-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 ; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7 @@ -6578,23 +6562,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s16, s0, s8 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_mov_b32 s46, s0 ; GFX10-NEXT: s_and_b32 s17, s17, 1 -; GFX10-NEXT: s_mov_b32 s47, s1 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_subb_u32 s17, s1, s9 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_and_b32 s18, s18, 1 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_subb_u32 s30, s2, s10 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s19, s19, 1 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_subb_u32 s31, s3, s11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[30:31], s[2:3] -; GFX10-NEXT: s_cmp_eq_u64 s[30:31], s[2:3] +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 @@ -6614,13 +6596,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_cselect_b32 s24, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[18:19], s22 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_ashr_i32 s10, s31, 31 +; GFX10-NEXT: s_ashr_i32 s10, s19, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20 -; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21 +; GFX10-NEXT: s_ashr_i64 s[0:1], s[18:19], s20 +; GFX10-NEXT: s_ashr_i64 s[8:9], s[18:19], s21 ; GFX10-NEXT: s_cmp_lg_u32 s23, 0 ; GFX10-NEXT: s_mov_b32 s11, s10 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9] @@ -6641,7 +6623,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s31 +; GFX10-NEXT: v_mov_b32_e32 v3, s19 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6655,7 +6637,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s3, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo ; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s30 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10-NEXT: s_subb_u32 s3, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index f6fc451f8060..8c1bc5fb57ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -176,22 +176,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s9, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v15, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_lshr_b32 s0, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v19, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v15 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 -; GFX10-NEXT: ds_write_b8 v1, v19 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: s_lshr_b32 s1, s7, 16 @@ -202,12 +202,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s2, s7, 24 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:13 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:14 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:15 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:15 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -286,7 +286,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:12 -; GFX10-NEXT: ds_write_b16 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b16 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:6 ; GFX10-NEXT: ds_write_b16 v1, v7 offset:10 ; GFX10-NEXT: ds_write_b16 v1, v8 offset:14 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 88277f4d2bdf..c96a98fe631f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -147,12 +147,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s3, s12, 24 ; GFX10-NEXT: s_lshr_b32 s6, s14, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s5 ; GFX10-NEXT: s_lshr_b32 s2, s13, 8 ; GFX10-NEXT: s_lshr_b32 s4, s13, 16 ; GFX10-NEXT: s_lshr_b32 s7, s14, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s14 -; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: s_lshr_b32 s8, s14, 24 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v10, s6 @@ -161,13 +161,13 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:1 -; GFX10-NEXT: ds_write_b8 v1, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:2 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:5 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:6 ; GFX10-NEXT: v_mov_b32_e32 v0, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: ds_write_b8 v1, v15 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 @@ -239,13 +239,13 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v3, s14 ; GFX10-NEXT: s_lshr_b32 s2, s14, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b16 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b16 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b16 v1, v7 offset:6 +; GFX10-NEXT: ds_write_b16 v1, v5 offset:6 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:10 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 0e23a1675782..681b8f0d1286 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2819,20 +2819,16 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v11, v5 -; GFX10-NEXT: v_mov_b32_e32 v15, v6 -; GFX10-NEXT: v_mov_b32_e32 v16, v7 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo -; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v2, v15 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11] -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -3203,22 +3199,22 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; ; GFX10-LABEL: uaddsat_i128_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[0:1] +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v10, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -3435,33 +3431,25 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v18, v8 -; GFX10-NEXT: v_mov_b32_e32 v19, v9 -; GFX10-NEXT: v_mov_b32_e32 v16, v10 -; GFX10-NEXT: v_mov_b32_e32 v17, v11 -; GFX10-NEXT: v_mov_b32_e32 v10, v12 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v18 -; GFX10-NEXT: v_mov_b32_e32 v11, v13 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v19, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v20, v14 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v16, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v21, v15 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v17, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[18:19] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v10 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v11, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v20, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v21, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[16:17] +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[20:21] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[16:17] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[20:21] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index f5c9bb56e780..b71703a2abcc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2689,16 +2689,12 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, v4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v11, v5, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5] -; GFX10-NEXT: v_sub_co_u32 v4, s4, v0, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v1, v7, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[6:7] +; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 @@ -2974,7 +2970,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s10, s10, 1 ; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: s_subb_u32 s14, s2, s6 +; GFX10-NEXT: s_subb_u32 s10, s2, s6 ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 ; GFX10-NEXT: s_and_b32 s11, s11, 1 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 @@ -2989,7 +2985,7 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s14, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -3305,41 +3301,33 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v22, v0 -; GFX10-NEXT: v_mov_b32_e32 v23, v1 -; GFX10-NEXT: v_mov_b32_e32 v20, v2 -; GFX10-NEXT: v_mov_b32_e32 v21, v3 -; GFX10-NEXT: v_mov_b32_e32 v26, v4 -; GFX10-NEXT: v_mov_b32_e32 v27, v5 -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[22:23], v[8:9] -; GFX10-NEXT: v_mov_b32_e32 v24, v6 -; GFX10-NEXT: v_mov_b32_e32 v25, v7 +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[20:21], v[10:11] -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[24:25], v[14:15] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[10:11] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[26:27], v[12:13] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[24:25], v[14:15] +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v22, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v23, v9, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v20, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v21, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v26, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v27, v13, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v24, v14, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v25, v15, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5 @@ -3630,7 +3618,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13] -; GFX10-NEXT: s_subb_u32 s30, s6, s14 +; GFX10-NEXT: s_subb_u32 s10, s6, s14 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 @@ -3656,7 +3644,7 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_readfirstlane_b32 s2, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s30, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 921e0b35a8ae..5f28f31aff64 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -500,12 +500,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB2_2: @@ -551,11 +551,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB2_2: @@ -1680,12 +1680,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB9_2: @@ -1731,11 +1731,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB9_2: @@ -2534,12 +2534,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB14_2: @@ -2585,11 +2585,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB14_2: @@ -2768,12 +2768,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB15_2: @@ -2819,11 +2819,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB15_2: @@ -3002,12 +3002,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB16_2: @@ -3053,11 +3053,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB16_2: @@ -3238,12 +3238,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB17_2: @@ -3291,11 +3291,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB17_2: @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB19_2: @@ -3708,11 +3708,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB19_2: @@ -4070,12 +4070,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB21_2: @@ -4121,11 +4121,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB21_2: @@ -4480,12 +4480,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB23_2: @@ -4531,11 +4531,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB23_2: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 2781993221e0..765a68198216 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -92,7 +92,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13] @@ -101,7 +101,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1064-NEXT: BB0_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -328,14 +328,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_3 ; GFX1064-NEXT: ; %bb.2: ; GFX1064-NEXT: v_mov_b32_e32 v0, s12 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1064-NEXT: BB1_3: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index ab20b16624c0..8213a3700225 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -233,7 +233,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 27c5fa4b1d66..e0b30adc0627 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1121,7 +1121,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 @@ -1144,9 +1144,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX10-NEXT: global_store_dword v11, v6, s[0:1] offset:24 -; GFX10-NEXT: global_store_dwordx2 v11, v[4:5], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24 +; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index a1c8e48917d0..9dcffcdb7ca1 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -90,8 +90,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write_b32 v3, v2 offset:12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v4, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm entry: @@ -340,8 +340,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_fmas_f32 v7, s0, s0, s0 -; GFX10-NEXT: global_store_dword v[0:1], v7, off +; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v5, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll index 8c126869820b..1ec14d119dba 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll @@ -20,9 +20,7 @@ define { double, double } @testfn(double %arg, double %arg1, double %arg2) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[0:1] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1] ; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] ; GFX10-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 39af8c11e6d2..e465320da0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -970,11 +970,11 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v15, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v15, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -989,7 +989,7 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace( ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v15, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 @@ -1141,10 +1141,10 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] @@ -1299,10 +1299,10 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], v[6:7], v[4:5], v[4:5] -; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] -; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[14:15], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] @@ -1893,49 +1893,49 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[15:16], v11, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 ; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v6, v16, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v16 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v4 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v10, v5 -; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_div_fixup_f16 v5, v10, v3, v4 -; GFX10-NEXT: v_trunc_f16_e32 v10, v5 -; GFX10-NEXT: v_fmac_f16_e64 v4, -v10, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX10-NEXT: v_trunc_f16_e32 v5, v5 +; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_and_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX10-NEXT: v_div_fixup_f16 v5, v5, v15, v0 +; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 -; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v15 +; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX10-NEXT: v_rcp_f32_e32 v7, v7 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 @@ -1943,7 +1943,7 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2 ; GFX10-NEXT: v_and_b32_e32 v2, v3, v6 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <4 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 @@ -2161,11 +2161,11 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v11, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v11, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -2178,26 +2178,26 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v5, v7, v8 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v2, v0 -; GFX10-NEXT: v_div_fixup_f32 v5, v6, v3, v1 -; GFX10-NEXT: v_trunc_f32_e32 v6, v5 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 -; GFX10-NEXT: v_fma_f32 v1, v3, -v6, v1 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 -; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v4 +; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX10-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX10-NEXT: v_fma_f32 v5, -v5, v7, v4 +; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v5, v6, v7 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2 -; GFX10-NEXT: global_store_dwordx2 v11, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 @@ -2538,11 +2538,11 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[15:18], v8, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v18 -; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v18, v7, v18 +; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 @@ -2553,55 +2553,55 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v17, v6, v17 -; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v18 +; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 -; GFX10-NEXT: v_fma_f32 v18, v7, -v9, v18 -; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v17 +; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3 +; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX10-NEXT: v_mul_f32_e32 v11, v0, v10 -; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v0 +; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 ; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11 -; GFX10-NEXT: v_fma_f32 v1, -v9, v11, v0 +; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v7, v1, v10, v11 -; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v17 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 -; GFX10-NEXT: v_fma_f32 v17, v6, -v7, v17 -; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v16 -; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v16, v5, v16 +; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2 +; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 ; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9 -; GFX10-NEXT: v_mul_f32_e32 v0, v6, v9 -; GFX10-NEXT: v_fma_f32 v11, -v7, v0, v6 -; GFX10-NEXT: v_fma_f32 v0, v11, v9, v0 -; GFX10-NEXT: v_fma_f32 v6, -v7, v0, v6 +; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX10-NEXT: v_fma_f32 v10, v11, v9, v10 +; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v0 -; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v16 +; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6 -; GFX10-NEXT: v_fma_f32 v16, v5, -v6, v16 -; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v15 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v15, v4, v15 +; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1 +; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 ; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v0, v5, v7 -; GFX10-NEXT: v_fma_f32 v10, -v6, v0, v5 -; GFX10-NEXT: v_fma_f32 v0, v10, v7, v0 -; GFX10-NEXT: v_fma_f32 v5, -v6, v0, v5 +; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX10-NEXT: v_fma_f32 v9, v10, v7, v9 +; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v0 -; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v15 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_fmac_f32_e64 v15, -v5, v4 -; GFX10-NEXT: global_store_dwordx4 v8, v[15:18], s[4:5] +; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 @@ -2842,34 +2842,34 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[18:21], v16, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[20:21], v[20:21], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[18:19], v[18:19], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[20:21], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] -; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] -; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[20:21], v[2:3] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[18:19], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] -; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[18:19], v[0:1] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm <2 x double> addrspace(1)* %in2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 550fa502b1bf..2648fde7b6c9 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -843,31 +843,31 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v19, 15, v6 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 +; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 ; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7 -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v19, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX10-NEXT: v_or_b32_e32 v11, v6, v4 +; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX10-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) @@ -1005,28 +1005,28 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11 ; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX10-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 ; GFX10-NEXT: v_and_b32_e32 v10, 15, v10 -; GFX10-NEXT: v_and_b32_e32 v15, 15, v8 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12 ; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0 ; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 -; GFX10-NEXT: v_or_b32_e32 v7, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) @@ -1085,9 +1085,9 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { ; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 63, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1172,18 +1172,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2 ; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v15, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v19, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v13, 63, v11 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], v15, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[6:7], v9, v[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[15:16], v13, v[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 -; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 -; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %ret @@ -1331,10 +1331,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 8, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 8, v5 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v7 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 2d5588379947..32e4f58df884 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -466,8 +466,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v2, s0 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v2, s0 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -546,16 +546,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v10, v8 -; GFX10-NEXT: v_mad_f32 v7, -v10, v0, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v10 +; GFX10-NEXT: v_trunc_f32_e32 v8, v8 +; GFX10-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -646,8 +646,8 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v0| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -725,14 +725,14 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v7 +; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7 ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v11, v1 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6 -; GFX10-NEXT: v_trunc_f32_e32 v10, v8 +; GFX10-NEXT: v_trunc_f32_e32 v8, v8 ; GFX10-NEXT: v_or_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_mad_f32 v5, -v10, v0, v11 -; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v10 +; GFX10-NEXT: v_mad_f32 v5, -v8, v0, v5 +; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0| ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] @@ -742,8 +742,8 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v2 -; GFX10-NEXT: global_store_short v[5:6], v7, off +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v7, v2 +; GFX10-NEXT: global_store_short v[5:6], v2, off ; GFX10-NEXT: s_cbranch_vccz BB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 36951b7f5929..7a3fea9b85d9 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2732,11 +2732,11 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v7, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index e3b5f81b19c3..6f44f2aa7080 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -341,21 +341,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v10, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX10-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v10, v9, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -534,7 +534,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -543,7 +543,7 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -718,14 +718,14 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_i32_i24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -908,13 +908,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 5747d4f437b6..ad5a0a5bd65f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -327,17 +327,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 -; GFX10-DL-NEXT: v_and_b32_e32 v10, s0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 -; GFX10-DL-NEXT: v_mad_u16 v3, v4, v10, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -517,7 +517,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -526,7 +526,7 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -841,7 +841,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v11, v4 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v7, v6, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -1025,17 +1025,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v0, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v4, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1215,14 +1215,14 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v7, v0, v5 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -1412,11 +1412,11 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v6, v0, v3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v6 -; GFX10-DL-NEXT: v_add3_u32 v0, v6, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v9, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v9, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] @@ -1809,13 +1809,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v7, s2, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2230,7 +2230,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v3, v1, v2, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2239,7 +2239,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 84a21ad4be4e..d0cde94b098c 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -644,26 +644,26 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 @@ -672,13 +672,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 @@ -686,13 +686,13 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; @@ -722,55 +722,55 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc16: @@ -1218,26 +1218,26 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 @@ -1246,13 +1246,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 @@ -1260,13 +1260,13 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1 ; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; @@ -1296,55 +1296,55 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8: @@ -1713,25 +1713,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2 -; GFX10-DL-XNACK-NEXT: v_bfe_i32 v15, v1, 16, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v12, v2, 20, 4 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v14, v0, v7, v5 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, v0, v7, v5 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_i32 v13, v2, 24, 4 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v15, v14, v3, v4 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 ; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v6, v15, v8, v6 -; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v7, v1, v2 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v6, v3, v4 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v8, v6 +; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v7, v5 +; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; @@ -1765,25 +1765,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2 -; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v15, v1, 16, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v12, v0, 20, 4 ; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v2, v2, v7, v5 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v7, v1, 24, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v13, v0, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v15, v10 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v8, v8, v9 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v15, v2, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v6, v6, v10 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v3, v11, v12 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v7, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v1, 28, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v15, v8, v6 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v2, v2, v8, v6 ; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v3, v2, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v3, v0, v5 +; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_multiuses_mul1: @@ -2550,7 +2550,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v19, v2, 24, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4 @@ -2577,7 +2577,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v19 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 @@ -2592,9 +2592,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v7, v1, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v7, v5 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 @@ -2638,7 +2638,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v19, v0, 24, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4 @@ -2665,7 +2665,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v19 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 @@ -2676,11 +2676,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v7, v0, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v7, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v19, 0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -3207,7 +3207,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v19, s[0:1] +; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) @@ -3250,8 +3250,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 @@ -3262,13 +3262,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v23, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v23 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -3284,12 +3284,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v23, v0 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-XNACK-NEXT: global_store_byte v19, v0, s[0:1] +; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: @@ -3297,7 +3297,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v19, 0 +; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -3308,7 +3308,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] -; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v19, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) @@ -3347,7 +3347,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v23, v9, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 @@ -3360,7 +3360,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12 @@ -3390,7 +3390,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NOXNACK-NEXT: global_store_byte v19, v0, s[0:1] +; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index aa8fc5513980..d3bb2a4981de 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -494,31 +494,31 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -812,31 +812,31 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1134,31 +1134,31 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -1441,31 +1441,31 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v11, v5, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] @@ -2373,49 +2373,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v19, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v19 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v23, 28, v1 +; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 ; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4 ; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GFX10-DL-NEXT: v_add_nc_u16 v14, v3, v9 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v14, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v23, 16, v4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 @@ -2762,7 +2762,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v19, 0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2773,7 +2773,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v3, v19, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2794,7 +2794,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v23, v2, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15 ; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9 @@ -2804,7 +2804,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v23 +; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12 ; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 ; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2820,12 +2820,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v23, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7 ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: global_store_byte v19, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -3115,7 +3115,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 24, 4 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7 @@ -3133,12 +3132,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8 ; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v11, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index dedda14bf8d5..b4d0399831bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -448,22 +448,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa ; ; GFX10-LABEL: load_3d_tfe_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -579,22 +579,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace ; ; GFX10-LABEL: load_cube_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -837,22 +837,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp ; ; GFX10-LABEL: load_2darray_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -968,22 +968,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp ; ; GFX10-LABEL: load_2dmsaa_both: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1361,22 +1361,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa ; ; GFX10-LABEL: load_mip_2d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index 0e0ea50e96e7..e39324874b33 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -566,10 +566,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -650,14 +650,14 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 -; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -707,9 +707,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -762,8 +762,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -804,10 +804,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -854,14 +854,14 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -911,9 +911,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -966,8 +966,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1162,8 +1162,8 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1196,8 +1196,8 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 47765caa4090..90050524088d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -96,13 +96,13 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10-LABEL: image_sample_2d_f16_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, v5 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index f75fe13e49d0..6e9daac16d83 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -79,7 +79,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10-LABEL: sample_1d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] @@ -92,7 +92,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] @@ -499,7 +499,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10-LABEL: sample_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] +; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] @@ -512,7 +512,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index b4f89e5d9ba0..6a3248e50ee9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -15,12 +15,12 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -58,9 +58,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -139,12 +139,12 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -166,9 +166,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index 002d4e69ad26..7c20bc69189f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -15,12 +15,12 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -58,9 +58,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -89,8 +89,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -139,12 +139,12 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -166,9 +166,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -197,8 +197,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 9edd1a397b78..e88b70fb449a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -356,8 +356,8 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D @@ -379,7 +379,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -388,7 +388,7 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB3_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 @@ -492,8 +492,8 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote -; GFX10-32-NEXT: s_wqm_b32 s28, s12 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 @@ -515,7 +515,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_waitcnt vmcnt(0) ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc -; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] ; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec @@ -524,7 +524,7 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] ; GFX10-64-NEXT: BB4_3: ; %.continue -; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D @@ -637,8 +637,8 @@ define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] ; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 ; GFX10-64-NEXT: ; %bb.1: ; %.entry -; GFX10-64-NEXT: s_wqm_b64 s[28:29], s[12:13] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[28:29] +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index bdeda3e4f04b..9b2f8aa23273 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -157,25 +157,25 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u32 v15, v0, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX10-NEXT: v_mul_hi_u32 v5, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v6, v0, v3 ; GFX10-NEXT: v_mul_lo_u32 v8, v1, v2 ; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX10-NEXT: v_mul_hi_i32 v9, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v11, v1, v3 -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v15 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v10, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v11 +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v11, v2 +; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v6, v2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_add3_u32 v1, v5, v15, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v5, v4, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v6, v0 @@ -461,8 +461,8 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX10-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX10-NEXT: v_ashrrev_i64 v[6:7], 2, v[4:5] -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 4f48c06fa1ce..d4fa0b3386b2 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -539,15 +539,15 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index d9962d2fbceb..d686af2f1db3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -330,12 +330,12 @@ define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrs ; GCN-SCRATCH: ; %bb.0: ; %entry ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: global_load_short_d16_hi v6, v[0:1], off +; GCN-SCRATCH-NEXT: global_load_short_d16_hi v5, v[0:1], off ; GCN-SCRATCH-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:64 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off +; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -373,12 +373,12 @@ define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrs ; GCN-SCRATCH: ; %bb.0: ; %entry ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, v2 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, v2 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: global_load_short_d16 v6, v[0:1], off +; GCN-SCRATCH-NEXT: global_load_short_d16 v5, v[0:1], off ; GCN-SCRATCH-NEXT: global_load_short_d16 v2, v[0:1], off offset:64 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v6, off +; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v5, off ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: global_store_dword v[3:4], v2, off offset:128 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir deleted file mode 100644 index 8862644d2264..000000000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-split.mir +++ /dev/null @@ -1,38 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - ---- | - define amdgpu_kernel void @do_not_reassign_spill() #0 { ret void } - - attributes #0 = { "amdgpu-num-vgpr"="8" } -... - -# GCN-LABEL: do_not_reassign_spill{{$}} -# GCN: V_AND_B32_e32 killed $vgpr1, killed $vgpr5, ---- -name: do_not_reassign_spill -tracksRegLiveness: true -machineFunctionInfo: - stackPtrOffsetReg: $sgpr32 -stack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 4 } -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' } - - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 6, class: vgpr_32 } -body: | - bb.0: - %0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %3 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %4 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - %5 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) - S_NOP 0, implicit-def dead $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5 - %6 = V_AND_B32_e32 %1, %5, implicit $exec - S_ENDPGM 0, implicit %6 -... diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir deleted file mode 100644 index 918e009b3bf6..000000000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir +++ /dev/null @@ -1,69 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - - -# Test that subreg reassignments are correctly handled when whole register also -# conflicts. If this is mishandled stall counts will be incorrect and cause an -# infinite loop. -# GCN-LABEL: vgpr64_mixed_use{{$}} -# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF -# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF -# GCN: $vcc = IMPLICIT_DEF -# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF -# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF -# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF -# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF -# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF -# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF -# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF -# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF -# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF -# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF -# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF -# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF -# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec -# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec -# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 killed $vgpr4_vgpr5, killed $vgpr0_vgpr1, implicit $exec ---- -name: vgpr64_mixed_use -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } - - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' } - - { id: 3, class: vgpr_32 } - - { id: 4, class: vgpr_32 } - - { id: 5, class: sreg_64_xexec } - - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %16 = IMPLICIT_DEF - %17 = IMPLICIT_DEF - %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec - %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec - %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec - S_ENDPGM 0 -... diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir deleted file mode 100644 index df057da98c2b..000000000000 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ /dev/null @@ -1,611 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s - -# GCN-LABEL: v1_vs_v5{{$}} -# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1, ---- -name: v1_vs_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: v0_1_vs_v4{{$}} -# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3, ---- -name: v0_1_vs_v4 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %1, %0, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: v1_2_vs_v4_5{{$}} -# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5, ---- -name: v1_2_vs_v4_5 -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: s11_vs_vcc{{$}} -# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0 ---- -name: s11_vs_vcc -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr11' } - - { id: 1, class: vgpr_32 } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - $vcc_lo = IMPLICIT_DEF - %2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: s0_vs_s16{{$}} -# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0, ---- -name: s0_vs_s16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s1_vs_s16{{$}} -# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1, ---- -name: s1_vs_s16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr1 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr1, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s12_vs_null{{$}} -# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14, ---- -name: s12_vs_null -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr12' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s13_vs_m0{{$}} -# GCN: S_AND_B32 $m0, killed renamable $sgpr14, ---- -name: s13_vs_m0 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr13' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 $m0, %0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: s12_13_vs_s28_s29{{$}} -# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15, ---- -name: s12_13_vs_s28_s29 -tracksRegLiveness: true -registers: - - { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' } - - { id: 1, class: sreg_64 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr28_sgpr29 = IMPLICIT_DEF - %1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: livein{{$}} -# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0, ---- -name: livein -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32 } -liveins: - - { reg: '$vgpr0', virtual-reg: '' } - - { reg: '$vgpr4', virtual-reg: '' } -body: | - bb.0: - liveins: $vgpr0, $vgpr4 - - %0 = COPY $vgpr0 - %1 = COPY $vgpr4 - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: liveout{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, ---- -name: liveout -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - $vgpr0 = COPY %0 - $vgpr4 = COPY %1 - S_ENDPGM 0 -... - -# GCN-LABEL: implicit{{$}} -# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0 ---- -name: implicit -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128 } - - { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' } -body: | - bb.0: - %1 = IMPLICIT_DEF - V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0 - S_ENDPGM 0 -... - -# GCN-LABEL: occupancy_limit{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, ---- -name: occupancy_limit -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %3 = IMPLICIT_DEF - %4 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: csr{{$}} -# GCN: V_AND_B32_e32 $vgpr37, $vgpr0, ---- -name: csr -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } - - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } - - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vgpr_32, preferred-register: '$vgpr33' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %3 = IMPLICIT_DEF - %4 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %12, 0, 0, implicit $exec - S_ENDPGM 0 -... - -# Do not touch undefs -# GCN-LABEL: s0_vs_s16_undef{{$}} -# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0, ---- -name: s0_vs_s16_undef -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# GCN-LABEL: smem_bundle{{$}} -# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0 -# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0 ---- -name: smem_bundle -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' } - - { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' } - - { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' } - - { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' } - - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 { - %3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0 - %4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0 - } - S_ENDPGM 0 -... - -# GCN-LABEL: vreg_512_subs{{$}} -# don't care about the assignment: this used to trigger an infinite loop ---- -name: vreg_512_subs -tracksRegLiveness: true -registers: - - { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' } - - { id: 2, class: vgpr_32, preferred-register: '$vgpr28' } -body: | - bb.0: - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec - DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: vgpr_lo16_sub{{$}} -# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec -# GCN: renamable $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16 ---- -name: vgpr_lo16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } - - { id: 3, class: vgpr_lo16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - %3 = COPY %2.lo16 - $vgpr1_lo16 = COPY %3 - SI_RETURN_TO_EPILOG $vgpr1_lo16 -... - -# GCN-LABEL: vgpr_lo16{{$}} -# GCN: $vgpr1_lo16 = COPY killed renamable $vgpr0_lo16 ---- -name: vgpr_lo16 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_lo16, preferred-register: '$vgpr4_lo16' } -body: | - bb.0: - liveins: $vgpr0_lo16 - - %0 = COPY $vgpr0_lo16 - $vgpr1_lo16 = COPY %0 - SI_RETURN_TO_EPILOG $vgpr1_lo16 -... - -# GCN-LABEL: vgpr_hi16_sub{{$}} -# GCN: renamable $vgpr0 = V_AND_B32_e32 killed $vgpr3, killed $vgpr1, implicit $exec -# GCN: renamable $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16 ---- -name: vgpr_hi16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } - - { id: 3, class: vgpr_hi16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - %3 = COPY %2.hi16 - $vgpr1_hi16 = COPY %3 - SI_RETURN_TO_EPILOG $vgpr1_hi16 -... - -# GCN-LABEL: vgpr_hi16{{$}} -# GCN: $vgpr1_hi16 = COPY killed renamable $vgpr0_hi16 ---- -name: vgpr_hi16 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_hi16, preferred-register: '$vgpr4_hi16' } -body: | - bb.0: - liveins: $vgpr0_hi16 - - %0 = COPY $vgpr0_hi16 - $vgpr1_hi16 = COPY %0 - SI_RETURN_TO_EPILOG $vgpr1_hi16 -... - -# GCN-LABEL: sgpr_lo16_sub{{$}} -# GCN: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr14, $sgpr0, implicit-def $scc -# GCN: renamable $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16 ---- -name: sgpr_lo16_sub -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' } - - { id: 1, class: sgpr_32 } - - { id: 2, class: sgpr_lo16 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc - %2 = COPY %1.lo16 - $sgpr1_lo16 = COPY %2 - SI_RETURN_TO_EPILOG $sgpr1_lo16 -... - -# GCN-LABEL: sgpr_lo16{{$}} -# GCN: $sgpr1_lo16 = COPY killed renamable $sgpr0_lo16 ---- -name: sgpr_lo16 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_lo16, preferred-register: '$sgpr4_lo16' } -body: | - bb.0: - liveins: $sgpr0_lo16 - - %0 = COPY $sgpr0_lo16 - $sgpr1_lo16 = COPY %0 - SI_RETURN_TO_EPILOG $sgpr1_lo16 -... - -# Check that we do not use VGPR3 which we would use otherwise. -# We cannot use it because of interference with VGPR3_LO16. -# GCN-LABEL: v1_vs_v5_src_interence{{$}} -# GCN: V_AND_B32_e32 killed $vgpr7, killed $vgpr1, ---- -name: v1_vs_v5_src_interence -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - $vgpr3_lo16 = IMPLICIT_DEF - %2 = V_AND_B32_e32 %1, %0, implicit $exec - S_ENDPGM 0 -... - -# Test that bank of subreg is considered during scavenging. -# If handled incorrectly an infinite loop occurs. -# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}} -# GCN: S_AND_B32 killed renamable $sgpr13, $sgpr0, ---- -name: s0_vs_s15_16_17_sub1 -tracksRegLiveness: true -registers: - - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' } - - { id: 1, class: sgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - $sgpr0 = IMPLICIT_DEF - %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc - S_ENDPGM 0 -... - -# Test that the size of subreg is correctly handled in bank calculation. -# If handled incorrectly an infinite loop occurs. -# GCN-LABEL: vgpr_sub_dependence{{$}} -# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF -# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF -# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF -# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF -# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF -# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF -# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF -# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF -# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF -# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF -# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF -# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF -# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF -# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF -# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec -# GCN: $vgpr0_vgpr1 = V_ADD_F64_e64 0, killed $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec ---- -name: vgpr_sub_dependence -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' } - - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' } - - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } - - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' } - - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' } - - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' } - - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } - - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } - - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } - - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } - - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } - - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } - - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2 = IMPLICIT_DEF - %5 = IMPLICIT_DEF - %6 = IMPLICIT_DEF - %7 = IMPLICIT_DEF - %8 = IMPLICIT_DEF - %9 = IMPLICIT_DEF - %10 = IMPLICIT_DEF - %11 = IMPLICIT_DEF - %12 = IMPLICIT_DEF - %13 = IMPLICIT_DEF - %14 = IMPLICIT_DEF - %15 = IMPLICIT_DEF - %3 = V_ADD_F64_e64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec - %4 = V_ADD_F64_e64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec - S_ENDPGM 0 -... - -# GCN-LABEL: dbg_value_v1_v5{{$}} -# GCN: renamable $vgpr1 = IMPLICIT_DEF -# GCN: renamable $vgpr5 = IMPLICIT_DEF ---- -name: dbg_value_v1_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - DBG_VALUE debug-use %1, debug-use %0 - S_ENDPGM 0, implicit %0, implicit %1 -... - -# GCN-LABEL: kill_v1_v5{{$}} -# GCN: renamable $vgpr1 = IMPLICIT_DEF -# GCN: renamable $vgpr5 = IMPLICIT_DEF -# GCN: KILL killed renamable $vgpr5, killed renamable $vgpr1 ---- -name: kill_v1_v5 -tracksRegLiveness: true -registers: - - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' } - - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' } - - { id: 2, class: vgpr_32 } -body: | - bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - KILL %1, %0 - S_ENDPGM 0 -... diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index e789388325de..33526c956a8e 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -458,16 +458,16 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[9:10], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v9, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[7:8], v[9:10] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] ; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: global_store_dwordx2 v6, v[7:8], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX10-NEXT: global_store_byte v6, v0, s[6:7] ; GFX10-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 @@ -575,14 +575,14 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp -; GFX10-NEXT: v_add_nc_u32_e32 v10, v1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp -; GFX10-NEXT: v_add_nc_u32_e32 v9, v0, v2 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v6 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v0, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v4, v[9:10], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX10-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 812049942783..3c2b66c302c1 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -486,17 +486,17 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11] -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 480b0269ea95..1c7c1db25923 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -544,15 +544,15 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] offset:8 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 4764fad87b46..fde23b00aec5 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -1100,17 +1100,17 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[10:11] -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v6, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index 831bf871e7b6..cf2f5577df4b 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -230,12 +230,12 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_lshr_b32 s1, s7, 24 ; GFX10-NEXT: s_lshr_b32 s5, s5, 24 -; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 @@ -243,8 +243,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v11 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v11 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 @@ -252,7 +252,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v15 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 @@ -351,15 +351,15 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 ; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 ; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 -; GFX10-NEXT: ds_write_b16 v0, v7 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v7 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void @@ -420,9 +420,9 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v6, s7 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX10-NEXT: ds_write2_b32 v0, v3, v6 offset0:2 offset1:3 +; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 6babc931aedb..d54d41824c7c 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -196,11 +196,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: s_lshr_b32 s5, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v11, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 @@ -208,11 +208,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 ; GFX10-NEXT: ds_write_b8 v0, v3 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v11 offset:9 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v0, v15 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 ; GFX10-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll index af94bd4a1f25..97412b1e4c26 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll @@ -65,12 +65,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val @@ -88,12 +84,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val @@ -111,12 +103,8 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, < ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 89493343c6fc..110e65144e0d 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -75,10 +75,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-NEXT: v_fmac_f16_e32 v6, v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val @@ -128,23 +127,21 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v14, v5 -; GFX10-NEXT: v_mov_b32_e32 v15, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v15 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX10-NEXT: v_fmac_f16_e32 v15, v0, v2 +; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_fmac_f16_e32 v14, v1, v3 -; GFX10-NEXT: v_fmac_f16_e32 v5, v8, v7 -; GFX10-NEXT: v_fmac_f16_e32 v4, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v15 -; GFX10-NEXT: v_and_b32_e32 v2, v0, v14 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 +; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 +; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v0, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll index 067640c4cb1c..38077938fd7d 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll @@ -31,12 +31,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v13, v3 -; GFX10-NEXT: v_mov_b32_e32 v12, v2 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_fma_f64 v[2:3], v[12:13], v[6:7], v[10:11] -; GFX10-NEXT: v_fma_f64 v[0:1], v[14:15], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val @@ -77,18 +73,10 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v29, v7 -; GFX10-NEXT: v_mov_b32_e32 v28, v6 -; GFX10-NEXT: v_mov_b32_e32 v31, v5 -; GFX10-NEXT: v_mov_b32_e32 v30, v4 -; GFX10-NEXT: v_mov_b32_e32 v25, v3 -; GFX10-NEXT: v_mov_b32_e32 v24, v2 -; GFX10-NEXT: v_mov_b32_e32 v27, v1 -; GFX10-NEXT: v_mov_b32_e32 v26, v0 -; GFX10-NEXT: v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21] -; GFX10-NEXT: v_fma_f64 v[6:7], v[28:29], v[14:15], v[22:23] -; GFX10-NEXT: v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19] -; GFX10-NEXT: v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17] +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x double> %val @@ -162,12 +150,8 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x doubl ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v13, v3 -; GFX10-NEXT: v_mov_b32_e32 v12, v2 -; GFX10-NEXT: v_mov_b32_e32 v15, v1 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_fma_f64 v[2:3], -v[12:13], -v[6:7], v[10:11] -; GFX10-NEXT: v_fma_f64 v[0:1], -v[14:15], -v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9] +; GFX10-NEXT: v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg <2 x double> %x %neg.y = fneg <2 x double> %y diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll index daa7dcc8344d..9fc32fa3556c 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll @@ -65,12 +65,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val @@ -88,12 +84,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val @@ -111,12 +103,8 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, < ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mul_f64 v[2:3], v[8:9], v[6:7] -; GFX10-NEXT: v_mul_f64 v[0:1], v[10:11], v[4:5] +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll index 8e4e406ccf50..115d52ef838c 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll @@ -65,12 +65,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x double> %val @@ -88,12 +84,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %val @@ -111,12 +103,8 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, < ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], -v[6:7] -; GFX10-NEXT: v_add_f64 v[0:1], v[10:11], -v[4:5] +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x double> %val diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 30beac73efd1..3500090e8455 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -746,11 +746,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -778,15 +778,15 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dwordx2 v[9:10], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v10, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v9, 16, v2 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -816,12 +816,12 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1319,14 +1319,14 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dwordx2 v[7:8], v6, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_fma_f16 v4, v7, v2, v4 op_sel_hi:[0,1,1] -; GFX10-NEXT: v_pk_fma_f16 v2, v8, v2, v5 op_sel_hi:[0,1,1] -; GFX10-NEXT: v_pk_fma_f16 v0, v7, v3, v4 op_sel:[1,0,0] -; GFX10-NEXT: v_pk_fma_f16 v1, v8, v3, v2 op_sel:[1,0,0] +; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] +; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] +; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm entry: @@ -1380,14 +1380,16 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, v3, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v7, 16, v2 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1