[AMDGPU] Pre-allocate WWM registers to reduce VGPR pressure.

This change incorporates an effort by Connor Abbot to change how we deal with WWM operations potentially trashing valid values in inactive lanes. Previously, the SIFixWWMLiveness pass would work out which registers were being trashed within WWM regions, and ensure that the register allocator did not have any values it was depending on resident in those registers if the WWM section would trash them. This worked perfectly well, but would cause sometimes severe register pressure when the WWM section resided before divergent control flow (or at least that is where I mostly observed it). This fix instead runs through the WWM sections and pre-allocates some registers for WWM. It then reserves these registers so that the register allocator cannot use them. This results in a significant register saving on some WWM shaders I'm working with (130 -> 104 VGPRs, with just this change!). Differential Revision: https://reviews.llvm.org/D59295 llvm-svn: 357400
2019-04-01 15:19:52 +00:00 · 2019-04-01 15:19:52 +00:00 · 0a30f33ce2
parent d8519f4a7d
commit 0a30f33ce2
19 changed files with 475 additions and 640 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@ -51,7 +51,7 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
 FunctionPass *createAMDGPUUseNativeCallsPass();
@ -148,8 +148,8 @@ extern char &SIInsertSkipsPassID;
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;

-void initializeSIFixWWMLivenessPass(PassRegistry &);
-extern char &SIFixWWMLivenessID;
+void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
+extern char &SIPreAllocateWWMRegsID;

 void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
 extern char &AMDGPUSimplifyLibCallsID;
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -208,7 +208,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
-  initializeSIFixWWMLivenessPass(*PR);
+  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
@ -879,9 +879,8 @@ void GCNPassConfig::addFastRegAlloc() {
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just after RegisterCoalescing.
+  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
 }
@ -899,9 +898,8 @@ void GCNPassConfig::addOptimizedRegAlloc() {
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

-  // This must be run after SILowerControlFlow, since it needs to use the
-  // machine-level CFG, but before register allocation.
-  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+  // This must be run just after RegisterCoalescing.
+  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addOptimizedRegAlloc();
 }
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@ -95,7 +95,7 @@ add_llvm_target(AMDGPUCodeGen
  SIFixSGPRCopies.cpp
  SIFixupVectorISel.cpp
  SIFixVGPRCopies.cpp
-  SIFixWWMLiveness.cpp
+  SIPreAllocateWWMRegs.cpp
  SIFoldOperands.cpp
  SIFormMemoryClauses.cpp
  SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@ -1,417 +0,0 @@
-//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Computations in WWM can overwrite values in inactive channels for
-/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to their def(s) to make sure that they aren't
-/// overwritten.
-///
-/// As an example, consider this snippet:
-/// %vgpr0 = V_MOV_B32_e32 0.0
-/// if (...) {
-///   %vgpr1 = ...
-///   %vgpr2 = WWM killed %vgpr1
-///   ... = killed %vgpr2
-///   %vgpr0 = V_MOV_B32_e32 1.0
-/// }
-/// ... = %vgpr0
-///
-/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
-/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
-/// writing %vgpr1 would only write to channels that would be clobbered by the
-/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
-/// it would clobber even the inactive channels for which the if-condition is
-/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to its def to make sure they aren't allocated to the
-/// same register.
-///
-/// In general, we need to figure out what registers might have their inactive
-/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We do that by spotting three separate cases of registers:
-///
-/// 1. A "then phi": the value resulting from phi elimination of a phi node at
-///    the end of an if..endif. If there is WWM code in the "then", then we
-///    make the def at the end of the "then" branch a partial def by adding an
-///    implicit use of the register.
-///
-/// 2. A "loop exit register": a value written inside a loop but used outside the
-///    loop, where there is WWM code inside the loop (the case in the example
-///    above). We add an implicit_def of the register in the loop pre-header,
-///    and make the original def a partial def by adding an implicit use of the
-///    register.
-///
-/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
-///    in a loop header. If there is WWM code inside the loop, then we make all
-///    defs inside the loop partial defs by adding an implicit use of the
-///    register on each one.
-///
-/// Note that we do not need to consider an if..else..endif phi. We only need to
-/// consider non-uniform control flow, and control flow structurization would
-/// have transformed a non-uniform if..else..endif into two if..endifs.
-///
-/// The analysis to detect these cases relies on a property of the MIR
-/// arising from this pass running straight after PHIElimination and before any
-/// coalescing: that any virtual register with more than one definition must be
-/// the new register added to lower a phi node by PHIElimination.
-///
-/// FIXME: We should detect whether a register in one of the above categories is
-/// already live at the WWM code before deciding to add the implicit uses to
-/// synthesize its liveness.
-///
-/// FIXME: I believe this whole scheme may be flawed due to the possibility of
-/// the register allocator doing live interval splitting.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-wwm-liveness"
-
-namespace {
-
-class SIFixWWMLiveness : public MachineFunctionPass {
-private:
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *LoopInfo;
-  LiveIntervals *LIS = nullptr;
-  const SIInstrInfo *TII;
-  const SIRegisterInfo *TRI;
-  MachineRegisterInfo *MRI;
-
-  std::vector<MachineInstr *> WWMs;
-  std::vector<MachineOperand *> ThenDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
-  std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
-
-public:
-  static char ID;
-
-  SIFixWWMLiveness() : MachineFunctionPass(ID) {
-    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredID(MachineDominatorsID);
-    AU.addRequiredID(MachineLoopInfoID);
-    // Should preserve the same set that TwoAddressInstructions does.
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreservedID(LiveVariablesID);
-    AU.addPreservedID(MachineLoopInfoID);
-    AU.addPreservedID(MachineDominatorsID);
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-private:
-  void processDef(MachineOperand &DefOpnd);
-  bool processThenDef(MachineOperand *DefOpnd);
-  bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-  bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
-                "SI fix WWM liveness", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
-                "SI fix WWM liveness", false, false)
-
-char SIFixWWMLiveness::ID = 0;
-
-char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
-
-FunctionPass *llvm::createSIFixWWMLivenessPass() {
-  return new SIFixWWMLiveness();
-}
-
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
-  LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
-  bool Modified = false;
-
-  // This doesn't actually need LiveIntervals, but we can preserve them.
-  LIS = getAnalysisIfAvailable<LiveIntervals>();
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
-  TII = ST.getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-  MRI = &MF.getRegInfo();
-
-  DomTree = &getAnalysis<MachineDominatorTree>();
-  LoopInfo = &getAnalysis<MachineLoopInfo>();
-
-  // Scan the function to find the WWM sections and the candidate registers for
-  // having liveness modified.
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
-      if (MI.getOpcode() == AMDGPU::EXIT_WWM)
-        WWMs.push_back(&MI);
-      else {
-        for (MachineOperand &DefOpnd : MI.defs()) {
-          if (DefOpnd.isReg()) {
-            unsigned Reg = DefOpnd.getReg();
-            if (TRI->isVGPR(*MRI, Reg))
-              processDef(DefOpnd);
-          }
-        }
-      }
-    }
-  }
-  if (!WWMs.empty()) {
-    // Synthesize liveness over WWM sections as required.
-    for (auto ThenDef : ThenDefs)
-      Modified |= processThenDef(ThenDef);
-    for (auto LoopExitDef : LoopExitDefs)
-      Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
-    for (auto LoopPhiDef : LoopPhiDefs)
-      Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
-  }
-
-  WWMs.clear();
-  ThenDefs.clear();
-  LoopExitDefs.clear();
-  LoopPhiDefs.clear();
-
-  return Modified;
-}
-
-// During the function scan, process an operand that defines a VGPR.
-// This categorizes the register and puts it in the appropriate list for later
-// use when processing a WWM section.
-void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
-  unsigned Reg = DefOpnd.getReg();
-  // Get all the defining instructions. For convenience, make Defs[0] the def
-  // we are on now.
-  SmallVector<const MachineInstr *, 4> Defs;
-  Defs.push_back(DefOpnd.getParent());
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd.getParent())
-      Defs.push_back(&MI);
-  }
-  // Check whether this def dominates all the others. If not, ignore this def.
-  // Either it is going to be processed when the scan encounters its other def
-  // that dominates all defs, or there is no def that dominates all others.
-  // The latter case is an eliminated phi from an if..else..endif or similar,
-  // which must be for uniform control flow so can be ignored.
-  // Because this pass runs shortly after PHIElimination, we assume that any
-  // multi-def register is a lowered phi, and thus has each def in a separate
-  // basic block.
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
-      return;
-  }
-  // Check for the case of an if..endif lowered phi: It has two defs, one
-  // dominates the other, and there is a single use in a successor of the
-  // dominant def.
-  // Later we will spot any WWM code inside
-  // the "then" clause and turn the second def into a partial def so its
-  // liveness goes through the WWM code in the "then" clause.
-  if (Defs.size() == 2) {
-    auto DomDefBlock = Defs[0]->getParent();
-    if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
-      auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-      for (auto Succ : DomDefBlock->successors()) {
-        if (Succ == UseBlock) {
-          LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
-          ThenDefs.push_back(&DefOpnd);
-          return;
-        }
-      }
-    }
-  }
-  // Check for the case of a non-lowered-phi register (single def) that exits
-  // a loop, that is, it has a use that is outside a loop that the def is
-  // inside. We find the outermost loop that the def is inside but a use is
-  // outside. Later we will spot any WWM code inside that loop and then make
-  // the def a partial def so its liveness goes round the loop and through the
-  // WWM code.
-  if (Defs.size() == 1) {
-    auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
-    if (!Loop)
-      return;
-    bool IsLoopExit = false;
-    for (auto &Use : MRI->use_instructions(Reg)) {
-      auto UseBlock = Use.getParent();
-      if (Loop->contains(UseBlock))
-        continue;
-      IsLoopExit = true;
-      while (auto Parent = Loop->getParentLoop()) {
-        if (Parent->contains(UseBlock))
-          break;
-        Loop = Parent;
-      }
-    }
-    if (!IsLoopExit)
-      return;
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-        << " is a loop exit reg with loop header at "
-        << "bb." << Loop->getHeader()->getNumber() << "\n");
-    LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
-            &DefOpnd, Loop));
-    return;
-  }
-  // Check for the case of a lowered single-preheader-loop phi, that is, a
-  // multi-def register where the dominating def is in the loop pre-header and
-  // all other defs are in backedges. Later we will spot any WWM code inside
-  // that loop and then make the backedge defs partial defs so the liveness
-  // goes through the WWM code.
-  // Note that we are ignoring multi-preheader loops on the basis that the
-  // structurizer does not allow that for non-uniform loops.
-  // There must be a single use in the loop header.
-  if (!MRI->hasOneUse(Reg))
-    return;
-  auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
-  auto Loop = LoopInfo->getLoopFor(UseBlock);
-  if (!Loop || Loop->getHeader() != UseBlock
-      || Loop->contains(Defs[0]->getParent())) {
-    LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-        << " is multi-def but single use not in loop header\n");
-    return;
-  }
-  for (unsigned I = 1; I != Defs.size(); ++I) {
-    if (!Loop->contains(Defs[I]->getParent()))
-      return;
-  }
-  LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
-      << " is a loop phi reg with loop header at "
-      << "bb." << Loop->getHeader()->getNumber() << "\n");
-  LoopPhiDefs.push_back(
-      std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
-}
-
-// Process a then phi def: It has two defs, one dominates the other, and there
-// is a single use in a successor of the dominant def. Here we spot any WWM
-// code inside the "then" clause and turn the second def into a partial def so
-// its liveness goes through the WWM code in the "then" clause.
-bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
-  LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
-  if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-    // Ignore if dominating def is undef.
-    LLVM_DEBUG(dbgs() << "  ignoring as dominating def is undef\n");
-    return false;
-  }
-  unsigned Reg = DefOpnd->getReg();
-  // Get the use block, which is the endif block.
-  auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
-  // Check whether there is WWM code inside the then branch. The WWM code must
-  // be dominated by the if but not dominated by the endif.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
-        && !DomTree->dominates(UseBlock, WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  // Get the other def.
-  MachineInstr *OtherDef = nullptr;
-  for (auto &MI : MRI->def_instructions(Reg)) {
-    if (&MI != DefOpnd->getParent())
-      OtherDef = &MI;
-  }
-  // Make it a partial def.
-  OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *OtherDef);
-  return true;
-}
-
-// Process a loop exit def, that is, a register with a single use in a loop
-// that has a use outside the loop.  Here we spot any WWM code inside that loop
-// and then make the def a partial def so its liveness goes round the loop and
-// through the WWM code.
-bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
-      MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Add a new implicit_def in loop preheader(s).
-  for (auto Pred : Loop->getHeader()->predecessors()) {
-    if (!Loop->contains(Pred)) {
-      auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
-          TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
-      LLVM_DEBUG(dbgs() << *ImplicitDef);
-      (void)ImplicitDef;
-    }
-  }
-  // Make the original def partial.
-  DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
-          Reg, false, /*isImp=*/true));
-  LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
-  return true;
-}
-
-// Process a loop phi def, that is, a multi-def register where the dominating
-// def is in the loop pre-header and all other defs are in backedges. Here we
-// spot any WWM code inside that loop and then make the backedge defs partial
-// defs so the liveness goes through the WWM code.
-bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
-      MachineLoop *Loop) {
-  LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
-  // Check whether there is WWM code inside the loop.
-  bool ContainsWWM = false;
-  for (auto WWM : WWMs) {
-    if (Loop->contains(WWM->getParent())) {
-      LLVM_DEBUG(dbgs() << "  contains WWM: " << *WWM);
-      ContainsWWM = true;
-      break;
-    }
-  }
-  if (!ContainsWWM)
-    return false;
-  unsigned Reg = DefOpnd->getReg();
-  // Remove kill mark from uses.
-  for (auto &Use : MRI->use_operands(Reg))
-    Use.setIsKill(false);
-  // Make all defs except the dominating one partial defs.
-  SmallVector<MachineInstr *, 4> Defs;
-  for (auto &Def : MRI->def_instructions(Reg))
-    Defs.push_back(&Def);
-  for (auto Def : Defs) {
-    if (DefOpnd->getParent() == Def)
-      continue;
-    Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
-    LLVM_DEBUG(dbgs() << *Def);
-  }
-  return true;
-}
-
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@ -457,7 +457,12 @@ bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,

  const MachineRegisterInfo &MRI =
      FirstLdSt.getParent()->getParent()->getRegInfo();
-  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+  const unsigned Reg = FirstDst->getReg();
+
+  const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);

  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
 }
@ -1322,9 +1327,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
    MI.eraseFromParent();
    break;
  }
+  case AMDGPU::ENTER_WWM: {
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+    // WWM is entered.
+    MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64));
+    break;
+  }
  case AMDGPU::EXIT_WWM: {
-    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
-    // is exited.
+    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
+    // WWM is exited.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;
  }
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@ -121,6 +121,13 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

+// Pseudo instruction marking entry into a WWM region. It defines EXEC and is
+// rewritten to S_OR_SAVEEXEC_B64 by SIInstrInfo::expandPostRAPseudo; it only
+// has its own opcode so that SIPreAllocateWWMRegs can tell when WWM is
+// entered.
+def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
+  let Defs = [EXEC];
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@ -22,6 +22,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@ -259,6 +260,10 @@ public:
    SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
  };

+  // Registers set aside for WWM. SIRegisterInfo::getReservedRegs() adds every
+  // register in this set (via reserveRegisterTuples) to the reserved set.
+  SparseBitVector<> WWMReservedRegs;
+
+  // Record \p reg in WWMReservedRegs. Called by SIPreAllocateWWMRegs for each
+  // physical register it hands out.
+  void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+
 private:
  // SGPR->VGPR spilling support.
  using SpillRegMask = std::pair<unsigned, unsigned>;
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@ -0,0 +1,221 @@
+//===- SIPreAllocateWWMRegs.cpp - WWM Register Pre-allocation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to pre-allocate WWM registers
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+
+namespace {
+
+/// Machine function pass that assigns physical registers to the virtual VGPRs
+/// defined between ENTER_WWM and EXIT_WWM (and by V_SET_INACTIVE_B32/B64),
+/// rewrites the affected operands, and records the chosen registers in
+/// SIMachineFunctionInfo as reserved for WWM.
+class SIPreAllocateWWMRegs : public MachineFunctionPass {
+private:
+  // Per-function state; all of these are (re)initialised at the top of
+  // runOnMachineFunction().
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
+  LiveRegMatrix *Matrix;
+  VirtRegMap *VRM;
+  RegisterClassInfo RegClassInfo;
+
+  // Virtual registers that processDef() assigned in the current function.
+  // rewriteRegs() consumes this list and clears it.
+  std::vector<unsigned> RegsToRewrite;
+
+public:
+  static char ID;
+
+  SIPreAllocateWWMRegs() : MachineFunctionPass(ID) {
+    initializeSIPreAllocateWWMRegsPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Scan \p MF for WWM regions and pre-assign registers; returns true if any
+  /// register was assigned.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// Requires LiveIntervals, VirtRegMap and LiveRegMatrix; declares
+  /// LiveIntervals, SlotIndexes and the CFG as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<VirtRegMap>();
+    AU.addRequired<LiveRegMatrix>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  // Try to assign a physical register to the virtual VGPR defined by MO.
+  bool processDef(MachineOperand &MO);
+  // Replace assigned virtual registers with their physical registers and
+  // reserve those registers.
+  void rewriteRegs(MachineFunction &MF);
+};
+
+} // End anonymous namespace.
+
+// Pass registration under the DEBUG_TYPE name, listing the analyses this pass
+// depends on (the same three that getAnalysisUsage() marks as required).
+INITIALIZE_PASS_BEGIN(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                "SI Pre-allocate WWM Registers", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(SIPreAllocateWWMRegs, DEBUG_TYPE,
+                "SI Pre-allocate WWM Registers", false, false)
+
+char SIPreAllocateWWMRegs::ID = 0;
+
+// Exported handle to the pass ID; GCNPassConfig passes its address to
+// insertPass() to schedule this pass after the register coalescer.
+char &llvm::SIPreAllocateWWMRegsID = SIPreAllocateWWMRegs::ID;
+
+/// Factory for the pass: returns a newly allocated SIPreAllocateWWMRegs
+/// instance.
+FunctionPass *llvm::createSIPreAllocateWWMRegsPass() {
+  return new SIPreAllocateWWMRegs();
+}
+
+/// Try to pre-assign a physical register to the virtual VGPR written by
+/// \p MO.
+///
+/// \returns true if a new assignment was made and queued in RegsToRewrite;
+/// false if \p MO is not a register operand, is not a VGPR, is already a
+/// physical register, or already has a physical register in the VirtRegMap.
+/// Hits llvm_unreachable if no register in the allocation order qualifies.
+bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+
+  const unsigned Reg = MO.getReg();
+
+  // Only virtual VGPRs that have no assignment yet are candidates.
+  if (!TRI->isVGPR(*MRI, Reg) || TRI->isPhysicalRegister(Reg) ||
+      VRM->hasPhys(Reg))
+    return false;
+
+  LiveInterval &Interval = LIS->getInterval(Reg);
+
+  // Walk the allocation order for the register's class and take the first
+  // register that MRI reports as unused and that the live-register matrix
+  // reports as free for this interval.
+  for (unsigned PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
+    if (MRI->isPhysRegUsed(PhysReg))
+      continue;
+    if (Matrix->checkInterference(Interval, PhysReg) !=
+        LiveRegMatrix::IK_Free)
+      continue;
+
+    Matrix->assign(Interval, PhysReg);
+    assert(PhysReg != 0);
+    RegsToRewrite.push_back(Reg);
+    return true;
+  }
+
+  llvm_unreachable("physreg not found for WWM expression");
+  return false;
+}
+
+/// Replace every operand that names a virtual register with a physical
+/// assignment in the VirtRegMap by that physical register (resolving any
+/// sub-register index), then reserve the registers handed out by
+/// processDef().
+///
+/// For each entry in RegsToRewrite the live interval is removed and the
+/// assigned physical register is recorded in SIMachineFunctionInfo via
+/// ReserveWWMRegister(). RegsToRewrite is cleared and the reserved-register
+/// set is re-frozen before returning.
+void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+
+        const unsigned VirtReg = MO.getReg();
+        // The VirtRegMap may only be queried with virtual registers. Testing
+        // for "not physical" is not sufficient here: register 0 ($noreg) is
+        // neither physical nor virtual and must be skipped too.
+        if (!TargetRegisterInfo::isVirtualRegister(VirtReg))
+          continue;
+
+        if (!VRM->hasPhys(VirtReg))
+          continue;
+
+        unsigned PhysReg = VRM->getPhys(VirtReg);
+        const unsigned SubReg = MO.getSubReg();
+        if (SubReg != 0) {
+          // Fold the sub-register index into the physical register itself.
+          PhysReg = TRI->getSubReg(PhysReg, SubReg);
+          MO.setSubReg(0);
+        }
+
+        MO.setReg(PhysReg);
+        MO.setIsRenamable(false);
+      }
+    }
+  }
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (unsigned Reg : RegsToRewrite) {
+    // The virtual register no longer appears in the function.
+    LIS->removeInterval(Reg);
+
+    const unsigned PhysReg = VRM->getPhys(Reg);
+    assert(PhysReg != 0);
+    MFI->ReserveWWMRegister(PhysReg);
+  }
+
+  RegsToRewrite.clear();
+
+  // Update the set of reserved registers to include WWM ones.
+  MRI->freezeReservedRegs(MF);
+}
+
+/// Pass entry point: pre-assign registers for the destination of every
+/// V_SET_INACTIVE_B32/B64 and for every def between ENTER_WWM and EXIT_WWM,
+/// then rewrite the function if anything was assigned.
+///
+/// \returns true if at least one register was assigned.
+bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n");
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  // Cache target hooks and the analyses used by the helpers.
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+  Matrix = &getAnalysis<LiveRegMatrix>();
+  VRM = &getAnalysis<VirtRegMap>();
+
+  RegClassInfo.runOnMachineFunction(MF);
+
+  bool Changed = false;
+
+  // Visit blocks in reverse post-order so that definitions are seen in
+  // dominance order. WWM expressions never involve phi nodes and WWM can only
+  // be left through the dedicated WWM instruction, so this is a perfect
+  // elimination order and we can never do any better.
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+
+  for (MachineBasicBlock *MBB : RPOT) {
+    // WWM state is tracked per block; every block starts outside WWM.
+    bool InsideWWM = false;
+    for (MachineInstr &MI : *MBB) {
+      const unsigned Opc = MI.getOpcode();
+
+      if (Opc == AMDGPU::V_SET_INACTIVE_B32 ||
+          Opc == AMDGPU::V_SET_INACTIVE_B64)
+        Changed |= processDef(MI.getOperand(0));
+
+      if (Opc == AMDGPU::ENTER_WWM) {
+        LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
+        InsideWWM = true;
+        // The defs of ENTER_WWM itself are not processed.
+        continue;
+      }
+
+      if (Opc == AMDGPU::EXIT_WWM) {
+        LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
+        InsideWWM = false;
+      }
+
+      if (InsideWWM) {
+        LLVM_DEBUG(dbgs() << "processing " << MI << "\n");
+
+        for (MachineOperand &DefOpnd : MI.defs())
+          Changed |= processDef(DefOpnd);
+      }
+    }
+  }
+
+  if (Changed)
+    rewriteRegs(MF);
+
+  return Changed;
+}
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@ -230,6 +230,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

+  for (unsigned Reg : MFI->WWMReservedRegs) {
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
  return Reserved;
 }

--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@ -656,8 +656,7 @@ void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
  MachineInstr *MI;

  assert(SaveOrig);
-  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
-               SaveOrig)
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
 }
@ -839,7 +838,23 @@ void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToCopyInstrs) {
    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
      MI->RemoveOperand(i);
-    MI->setDesc(TII->get(AMDGPU::COPY));
+
+    const unsigned Reg = MI->getOperand(0).getReg();
+
+    if (TRI->isVGPR(*MRI, Reg)) {
+      const TargetRegisterClass *regClass =
+          TargetRegisterInfo::isVirtualRegister(Reg)
+              ? MRI->getRegClass(Reg)
+              : TRI->getPhysRegClass(Reg);
+
+      const unsigned MovOp = TII->getMovOpcode(regClass);
+      MI->setDesc(TII->get(MovOp));
+
+      // And make it implicitly depend on exec (like all VALU movs should do).
+      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+    } else {
+      MI->setDesc(TII->get(AMDGPU::COPY));
+    }
  }
 }

--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@ -112,7 +112,7 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
@ -120,8 +120,7 @@ entry:
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
 ; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

 declare i32 @llvm.amdgcn.workitem.id.x()

@ -133,9 +133,7 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@ -136,9 +136,7 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
 define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@ -104,9 +104,7 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@ -117,9 +117,7 @@ entry:
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
-; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[sub_value:[0-9]+]] wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8MORE: v_sub_u32_e32 v[[sub_value]],{{( vcc,)?}} v[[sub_value]], v{{[0-9]+}}
-; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v[[sub_value]], 63
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
 define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-liveness.mir
@ -1,185 +0,0 @@
-# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o -  %s | FileCheck %s
-
-# Test a then phi value.
-#CHECK: test_wwm_liveness_then_phi
-#CHECK: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, implicit %21
-
---
-name:            test_wwm_liveness_then_phi
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: sreg_64, preferred-register: '' }
-  - { id: 1, class: sgpr_32, preferred-register: '' }
-  - { id: 2, class: sgpr_32, preferred-register: '' }
-  - { id: 3, class: vgpr_32, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: vgpr_32, preferred-register: '' }
-  - { id: 6, class: vgpr_32, preferred-register: '' }
-  - { id: 7, class: vgpr_32, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 9, class: sreg_64, preferred-register: '' }
-  - { id: 10, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 11, class: sreg_64, preferred-register: '' }
-  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 15, class: sreg_128, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: vgpr_32, preferred-register: '' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: sreg_64, preferred-register: '' }
-  - { id: 20, class: sreg_64, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: sreg_64, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-liveins:
-body:             |
-  bb.0:
-    successors: %bb.1(0x40000000), %bb.2(0x40000000)
-
-    %21 = V_MOV_B32_e32 0, implicit $exec
-    %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit $exec
-    %8 = V_CMP_GT_U32_e64 32, killed %6, implicit $exec
-    %22 = COPY $exec, implicit-def $exec
-    %23 = S_AND_B64 %22, %8, implicit-def dead $scc
-    %0 = S_XOR_B64 %23, %22, implicit-def dead $scc
-    $exec = S_MOV_B64_term killed %23
-    SI_MASK_BRANCH %bb.2, implicit $exec
-    S_BRANCH %bb.1
-
-  bb.1:
-    successors: %bb.2(0x80000000)
-
-    %13 = S_MOV_B32 61440
-    %14 = S_MOV_B32 -1
-    %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
-    %19 = COPY $exec
-    $exec = S_MOV_B64 -1
-    %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4)
-    %17 = V_ADD_F32_e32 1065353216, killed %16, implicit $exec
-    $exec = EXIT_WWM killed %19
-    %21 = V_MOV_B32_e32 1, implicit $exec
-    early-clobber %18 = WWM killed %17, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit $exec :: (store 4)
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %0, implicit-def $scc
-    $vgpr0 = COPY killed %21
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
-
-# Test a loop with a loop exit value and a loop phi.
-#CHECK: test_wwm_liveness_loop
-#CHECK: %4:vgpr_32 = IMPLICIT_DEF
-#CHECK: bb.1:
-#CHECK: %4:vgpr_32 = FLAT_LOAD_DWORD{{.*}}, implicit %4
-#CHECK: %27:vgpr_32 = COPY killed %21, implicit %27
-
---
-name:            test_wwm_liveness_loop
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '' }
-  - { id: 1, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 2, class: sreg_64, preferred-register: '' }
-  - { id: 3, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 4, class: vgpr_32, preferred-register: '' }
-  - { id: 5, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 6, class: sreg_64, preferred-register: '' }
-  - { id: 7, class: sreg_64, preferred-register: '' }
-  - { id: 8, class: sreg_64, preferred-register: '' }
-  - { id: 9, class: vreg_64, preferred-register: '' }
-  - { id: 10, class: vgpr_32, preferred-register: '' }
-  - { id: 11, class: vgpr_32, preferred-register: '' }
-  - { id: 12, class: vgpr_32, preferred-register: '' }
-  - { id: 13, class: sreg_64, preferred-register: '' }
-  - { id: 14, class: vreg_64, preferred-register: '' }
-  - { id: 15, class: sreg_32_xm0, preferred-register: '' }
-  - { id: 16, class: vgpr_32, preferred-register: '' }
-  - { id: 17, class: sreg_64, preferred-register: '$vcc' }
-  - { id: 18, class: vgpr_32, preferred-register: '' }
-  - { id: 19, class: vgpr_32, preferred-register: '' }
-  - { id: 20, class: vgpr_32, preferred-register: '' }
-  - { id: 21, class: vgpr_32, preferred-register: '' }
-  - { id: 22, class: vgpr_32, preferred-register: '' }
-  - { id: 23, class: sreg_64, preferred-register: '' }
-  - { id: 24, class: sreg_64, preferred-register: '' }
-  - { id: 25, class: sreg_64, preferred-register: '' }
-  - { id: 26, class: sreg_64, preferred-register: '' }
-  - { id: 27, class: vgpr_32, preferred-register: '' }
-liveins:
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:
-stack:
-constants:
-body:             |
-  bb.0:
-    successors: %bb.1(0x80000000)
-
-    %25:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %0:vgpr_32 = FLAT_LOAD_DWORD undef %9:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %25
-    %12:vgpr_32 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit $exec
-    %7:sreg_64 = S_MOV_B64 0
-    %26:sreg_64 = COPY killed %7
-    %27:vgpr_32 = COPY killed %12
-
-  bb.1:
-    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-
-    %24:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %20:vgpr_32 = COPY killed %27
-    %2:sreg_64 = COPY killed %26
-    %4:vgpr_32 = FLAT_LOAD_DWORD undef %14:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1)
-    $exec = EXIT_WWM killed %24
-    %22:vgpr_32 = V_ADD_I32_e32 -1, killed %20, implicit-def dead $vcc, implicit $exec
-    %17:sreg_64 = V_CMP_EQ_U32_e64 0, %22, implicit $exec
-    %6:sreg_64 = S_OR_B64 killed %17, killed %2, implicit-def $scc
-    %21:vgpr_32 = COPY killed %22
-    %26:sreg_64 = COPY %6
-    %27:vgpr_32 = COPY killed %21
-    $exec = S_ANDN2_B64_term $exec, %6, implicit-def $scc
-    S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    S_BRANCH %bb.2
-
-  bb.2:
-    $exec = S_OR_B64 $exec, killed %6, implicit-def $scc
-    %23:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %18:vgpr_32 = V_ADD_F32_e32 killed %0, killed %4, implicit $exec
-    $exec = EXIT_WWM killed %23
-    early-clobber %19:vgpr_32 = COPY killed %18, implicit $exec
-    $vgpr0 = COPY killed %19
-    SI_RETURN_TO_EPILOG killed $vgpr0
-
-...
-
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@ -81,7 +81,6 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
  ; GCN: bb.1:
  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; GCN:   $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 8 from %stack.5, align 4, addrspace 5)
-  ; GCN:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
  ; GCN:   $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
  ; GCN:   renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
  ; GCN:   renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
@ -93,9 +92,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
  ; GCN:   renamable $vgpr19 = COPY renamable $vgpr18
  ; GCN:   renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.5, align 4, addrspace 5)
-  ; GCN:   SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (store 8 into %stack.6, align 4, addrspace 5)
  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
-  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
  ; GCN:   $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
  ; GCN:   S_CBRANCH_EXECNZ %bb.1, implicit $exec
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@ -3,7 +3,7 @@
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
-#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: ENTER_WWM
 #CHECK: S_CMP_LT_I32
 #CHECK: S_CSELECT_B32
 name:            test_wwm_scc
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@ -0,0 +1,188 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O0 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O3 %s
+
+; Two whole-wave-mode (WWM) computations in a single basic block, with no
+; control flow between them. Both i32 halves of a loaded <2 x float> get their
+; inactive lanes set to 0, are added to the result of llvm.amdgcn.update.dpp
+; applied to themselves, and are passed through llvm.amdgcn.wwm. The checks
+; capture the register each WWM result is moved into (FIRST / SECOND) and
+; require the final compare to read exactly those two registers.
+define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) {
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+
+; First WWM value: DPP mov, add, then a plain v_mov of the add result.
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp105, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+
+; Second WWM value: the same instruction sequence is expected again.
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp107, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+
+; The compare must use the two registers captured above; only the encoding
+; (e32 writing vcc vs. e64 writing an SGPR pair) differs between the two runs.
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %tmp137
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+; WWM computations split across control flow. The first WWM value is produced
+; in %entry; the second is produced only in %if and reaches %merge through a
+; phi whose other incoming value is 0. In the unoptimized (-O0) run the checks
+; expect each WWM result to be written out with buffer_store_dword and read
+; back in %merge with buffer_load_dword from the same SGPR/immediate offsets
+; before the compare. In the optimized run the compare is expected to use the
+; captured registers directly.
+define amdgpu_cs void @cfg(<4 x i32> inreg %tmp14, i32 %arg) {
+entry:
+  %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
+  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
+  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
+  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
+
+; First WWM value, computed before the branch.
+; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
+; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
+; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]]
+  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
+  %tmp121 = add i32 %tmp105, %tmp120
+  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
+
+  %cond = icmp eq i32 %arg, 0
+  br i1 %cond, label %if, label %merge
+if:
+  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
+
+; Second WWM value, computed only on the %if path.
+; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
+; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
+; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]]
+  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
+  %tmp136 = add i32 %tmp107, %tmp135
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+  br label %merge
+
+merge:
+  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
+; The -O0 checks rebind FIRST / SECOND to the reloaded registers before the
+; compare; the offsets must match the ones captured at the stores above.
+; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
+; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]]
+; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]]
+; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
+  %tmp138 = icmp eq i32 %tmp122, %merge_value
+  %tmp139 = sext i1 %tmp138 to i32
+  %tmp140 = shl nsw i32 %tmp139, 1
+  %tmp141 = and i32 %tmp140, 2
+  %tmp145 = bitcast i32 %tmp141 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+; Non-inlined i32 helper that @call invokes. Computes (a + a) * a - (a + a).
+; The checks pin exact physical registers: the add writes v1, and both the
+; multiply and the subtract write v0.
+define i32 @called(i32 %a) noinline {
+; GFX9: v_add_u32_e32 v1, v0, v0
+  %add = add i32 %a, %a
+; GFX9: v_mul_lo_i32 v0, v1, v0
+  %mul = mul i32 %add, %a
+; GFX9: v_sub_u32_e32 v0, v0, v1
+  %sub = sub i32 %mul, %add
+  ret i32 %sub
+}
+
+; WWM computation that spans a function call. The kernel argument has its
+; inactive lanes set to 0, is passed to @called, and the call result is added
+; to it before llvm.amdgcn.wwm and the final store. The checks pin the exact
+; VGPRs used around the call, separately for the unoptimized and optimized
+; runs.
+define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
+; set.inactive is expected to appear as: a mov, s_not_b64 exec, a second mov,
+; and another s_not_b64 exec, back to back.
+; GFX9-O0: v_mov_b32_e32 v0, s2
+; GFX9-O3: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_not_b64 exec, exec
+  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
+; Before the call: the unoptimized run stores v0 to memory, the optimized run
+; copies v2 into v0.
+; GFX9-O0: buffer_store_dword v0
+; GFX9-O3: v_mov_b32_e32 v0, v2
+; GFX9: s_swappc_b64
+  %tmp134 = call i32 @called(i32 %tmp107)
+; After the call: the unoptimized run reloads into v1, the optimized run adds
+; against v2 directly.
+; GFX9-O0: buffer_load_dword v1
+; GFX9-O3: v_mov_b32_e32 v1, v0
+; GFX9-O0: v_add_u32_e32 v0, v0, v1
+; GFX9-O3: v_add_u32_e32 v1, v1, v2
+  %tmp136 = add i32 %tmp134, %tmp107
+  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
+; GFX9-O0: buffer_store_dword v2
+; GFX9-O3: buffer_store_dword v0
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+; Non-inlined i64 helper that @call_i64 invokes. Computes
+; (a + a) * a - (a + a). No check lines are attached to its body.
+define i64 @called_i64(i64 %a) noinline {
+  %add = add i64 %a, %a
+  %mul = mul i64 %add, %a
+  %sub = sub i64 %mul, %add
+  ret i64 %sub
+}
+
+; 64-bit WWM computation that spans a function call. The i64 kernel argument
+; has its inactive lanes set to 0, is passed to @called_i64, and the call
+; result is added to it before llvm.amdgcn.wwm and the final store. Each
+; 32-bit half shows up as its own instruction in the checks.
+define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) {
+; set.inactive on an i64: two movs, s_not_b64 exec, two more movs, and another
+; s_not_b64 exec, back to back.
+; GFX9-O0: v_mov_b32_e32 v0, s0
+; GFX9-O0: v_mov_b32_e32 v1, s1
+; GFX9-O3: v_mov_b32_e32 v7, s1
+; GFX9-O3: v_mov_b32_e32 v6, s0
+; GFX9-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_not_b64 exec, exec
+  %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
+; The unoptimized run is expected to store both halves before the call and to
+; load two dwords (into v3 and v4) after it.
+; GFX9-O0: buffer_store_dword v0
+; GFX9-O0: buffer_store_dword v1
+; GFX9: s_swappc_b64
+  %tmp134 = call i64 @called_i64(i64 %tmp107)
+; GFX9-O0: buffer_load_dword v3
+; GFX9-O0: buffer_load_dword v4
+  %tmp136 = add i64 %tmp134, %tmp107
+  %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136)
+  %tmp138 = bitcast i64 %tmp137 to <2 x i32>
+  call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
+  ret void
+}
+
+; Three i64 set.inactive / wwm pairs fed from buffer loads whose offset
+; depends on %index. The checks only require that the two loads (dwordx4,
+; dwordx2) and the two stores (dwordx4, dwordx2) appear in that order; no
+; registers are pinned.
+define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
+  %tmp17 = shl i32 %index, 5
+; GFX9: buffer_load_dwordx4
+  %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
+  %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
+  %tmp19 = or i32 %tmp17, 16
+; GFX9: buffer_load_dwordx2
+  %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
+  %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
+  %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
+  %tmp97 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp22)
+  %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
+  %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
+  %tmp174 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp99)
+  %.i25 = bitcast <2 x i32> %tmp20 to i64
+  %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
+  %tmp251 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp176)
+  %.cast = bitcast i64 %tmp97 to <2 x float>
+  %.cast6 = bitcast i64 %tmp174 to <2 x float>
+  %.cast7 = bitcast i64 %tmp251 to <2 x float>
+  %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9: buffer_store_dwordx4
+  tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0)
+  ; GFX9: buffer_store_dwordx2
+  tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wwm.i32(i32)
+declare i64 @llvm.amdgcn.wwm.i64(i64)
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) 
+declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)