[mips] Fix atomic operations at O0, v3

Similar to PR/25526, fast-regalloc introduces spills at the end of basic
blocks. When this occurs between an ll and an sc, the stores can cause the
atomic sequence to fail.

This patch fixes the issue by introducing more pseudos to represent atomic
operations and moving their lowering to after the expansion of postRA
pseudos.

This version addresses issues with the initial implementation and covers
all atomic operations.

This resolves PR/32020.
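
As a rough illustration (a hypothetical reproducer, not taken from the bug
report), any atomic read-modify-write compiled at -O0 exercises this path,
since it is lowered to an ll/sc retry loop:

  #include <atomic>

  std::atomic<int> Counter;

  // At -O0 this fetch_add lowers to an ll/sc retry loop on MIPS. If
  // fast-regalloc then spills a live value between the ll and the sc, the
  // spill store can make the sc fail, and on some implementations it fails
  // on every retry, so the loop never makes progress.
  int Bump() { return Counter.fetch_add(1, std::memory_order_seq_cst); }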

Thanks to James Cowgill for reporting the issue!

Patch By: Simon Dardis

Differential Revision: https://reviews.llvm.org/D31287

llvm-svn: 336328
Author: Aleksandar Beserminji
Date: 2018-07-05 09:27:05 +00:00
Commit: 3239ba8c0e (parent b41c61eed4)

12 changed files with 10087 additions and 757 deletions


@@ -30,6 +30,7 @@ add_llvm_target(MipsCodeGen
MipsCCState.cpp
MipsConstantIslandPass.cpp
MipsDelaySlotFiller.cpp
MipsExpandPseudo.cpp
MipsFastISel.cpp
MipsInstrInfo.cpp
MipsInstructionSelector.cpp


@@ -37,6 +37,7 @@ namespace llvm {
FunctionPass *createMipsBranchExpansion();
FunctionPass *createMipsConstantIslandPass();
FunctionPass *createMicroMipsSizeReducePass();
FunctionPass *createMipsExpandPseudoPass();
InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &,
MipsSubtarget &,


@@ -85,6 +85,17 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
}
def ATOMIC_LOAD_ADD_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_LOAD_SUB_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_LOAD_AND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_LOAD_OR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_LOAD_XOR_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_LOAD_NAND_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_SWAP_I64_POSTRA : Atomic2OpsPostRA<GPR64>;
def ATOMIC_CMP_SWAP_I64_POSTRA : AtomicCmpSwapPostRA<GPR64>;
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC128 : Load<"", ACC128>;


@@ -0,0 +1,702 @@
//===-- MipsExpandPseudoInsts.cpp - Expand pseudo instructions ------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling, if-conversion, and other late
// optimizations. This pass should be run after register allocation but before
// the post-regalloc scheduling pass.
//
// This is currently only used for expanding atomic pseudos after register
// allocation. We do this to avoid the fast register allocator introducing
// spills between ll and sc. These stores cause some MIPS implementations to
// abort the atomic RMW sequence.
//
//===----------------------------------------------------------------------===//
#include "Mips.h"
#include "MipsInstrInfo.h"
#include "MipsSubtarget.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
#define DEBUG_TYPE "mips-pseudo"
namespace {
class MipsExpandPseudo : public MachineFunctionPass {
public:
static char ID;
MipsExpandPseudo() : MachineFunctionPass(ID) {}
const MipsInstrInfo *TII;
const MipsSubtarget *STI;
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override {
return "Mips pseudo instruction expansion pass";
}
private:
bool expandAtomicCmpSwap(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandAtomicCmpSwapSubword(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandAtomicBinOp(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI, unsigned Size);
bool expandAtomicBinOpSubword(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI);
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NMBB);
bool expandMBB(MachineBasicBlock &MBB);
};
char MipsExpandPseudo::ID = 0;
}
bool MipsExpandPseudo::expandAtomicCmpSwapSubword(
MachineBasicBlock &BB, MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI) {
MachineFunction *MF = BB.getParent();
const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
DebugLoc DL = I->getDebugLoc();
unsigned LL, SC;
unsigned ZERO = Mips::ZERO;
unsigned BNE = Mips::BNE;
unsigned BEQ = Mips::BEQ;
unsigned SEOp =
I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I8_POSTRA ? Mips::SEB : Mips::SEH;
if (STI->inMicroMipsMode()) {
LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
} else {
LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
unsigned Dest = I->getOperand(0).getReg();
unsigned Ptr = I->getOperand(1).getReg();
unsigned Mask = I->getOperand(2).getReg();
unsigned ShiftCmpVal = I->getOperand(3).getReg();
unsigned Mask2 = I->getOperand(4).getReg();
unsigned ShiftNewVal = I->getOperand(5).getReg();
unsigned ShiftAmnt = I->getOperand(6).getReg();
unsigned Scratch = I->getOperand(7).getReg();
unsigned Scratch2 = I->getOperand(8).getReg();
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB.getBasicBlock();
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB.getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), &BB,
std::next(MachineBasicBlock::iterator(I)), BB.end());
exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
// thisMBB:
// ...
// fallthrough --> loop1MBB
BB.addSuccessor(loop1MBB, BranchProbability::getOne());
loop1MBB->addSuccessor(sinkMBB);
loop1MBB->addSuccessor(loop2MBB);
loop1MBB->normalizeSuccProbs();
loop2MBB->addSuccessor(loop1MBB);
loop2MBB->addSuccessor(sinkMBB);
loop2MBB->normalizeSuccProbs();
sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
// loop1MBB:
// ll dest, 0(ptr)
// and Mask', dest, Mask
// bne Mask', ShiftCmpVal, sinkMBB
BuildMI(loop1MBB, DL, TII->get(LL), Scratch).addReg(Ptr).addImm(0);
BuildMI(loop1MBB, DL, TII->get(Mips::AND), Scratch2)
.addReg(Scratch)
.addReg(Mask);
BuildMI(loop1MBB, DL, TII->get(BNE))
.addReg(Scratch2).addReg(ShiftCmpVal).addMBB(sinkMBB);
// loop2MBB:
// and dest, dest, mask2
// or dest, dest, ShiftNewVal
// sc dest, dest, 0(ptr)
// beq dest, $0, loop1MBB
BuildMI(loop2MBB, DL, TII->get(Mips::AND), Scratch)
.addReg(Scratch, RegState::Kill)
.addReg(Mask2);
BuildMI(loop2MBB, DL, TII->get(Mips::OR), Scratch)
.addReg(Scratch, RegState::Kill)
.addReg(ShiftNewVal);
BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
.addReg(Scratch, RegState::Kill)
.addReg(Ptr)
.addImm(0);
BuildMI(loop2MBB, DL, TII->get(BEQ))
.addReg(Scratch, RegState::Kill)
.addReg(ZERO)
.addMBB(loop1MBB);
// sinkMBB:
// srl srlres, Mask', shiftamt
// sign_extend dest,srlres
BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
.addReg(Scratch2)
.addReg(ShiftAmnt);
if (STI->hasMips32r2()) {
BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
} else {
const unsigned ShiftImm =
I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I16_POSTRA ? 16 : 24;
BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
.addReg(Dest, RegState::Kill)
.addImm(ShiftImm);
BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
.addReg(Dest, RegState::Kill)
.addImm(ShiftImm);
}
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *loop1MBB);
computeAndAddLiveIns(LiveRegs, *loop2MBB);
computeAndAddLiveIns(LiveRegs, *sinkMBB);
computeAndAddLiveIns(LiveRegs, *exitMBB);
NMBBI = BB.end();
I->eraseFromParent();
return true;
}
bool MipsExpandPseudo::expandAtomicCmpSwap(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI) {
const unsigned Size =
I->getOpcode() == Mips::ATOMIC_CMP_SWAP_I32_POSTRA ? 4 : 8;
MachineFunction *MF = BB.getParent();
const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
DebugLoc DL = I->getDebugLoc();
unsigned LL, SC, ZERO, BNE, BEQ, MOVE;
if (Size == 4) {
if (STI->inMicroMipsMode()) {
LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
BNE = STI->hasMips32r6() ? Mips::BNEC_MMR6 : Mips::BNE_MM;
BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
} else {
LL = STI->hasMips32r6()
? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = STI->hasMips32r6()
? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
BNE = Mips::BNE;
BEQ = Mips::BEQ;
}
ZERO = Mips::ZERO;
MOVE = Mips::OR;
} else {
LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
ZERO = Mips::ZERO_64;
BNE = Mips::BNE64;
BEQ = Mips::BEQ64;
MOVE = Mips::OR64;
}
unsigned Dest = I->getOperand(0).getReg();
unsigned Ptr = I->getOperand(1).getReg();
unsigned OldVal = I->getOperand(2).getReg();
unsigned NewVal = I->getOperand(3).getReg();
unsigned Scratch = I->getOperand(4).getReg();
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB.getBasicBlock();
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB.getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), &BB,
std::next(MachineBasicBlock::iterator(I)), BB.end());
exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
// thisMBB:
// ...
// fallthrough --> loop1MBB
BB.addSuccessor(loop1MBB, BranchProbability::getOne());
loop1MBB->addSuccessor(exitMBB);
loop1MBB->addSuccessor(loop2MBB);
loop1MBB->normalizeSuccProbs();
loop2MBB->addSuccessor(loop1MBB);
loop2MBB->addSuccessor(exitMBB);
loop2MBB->normalizeSuccProbs();
// loop1MBB:
// ll dest, 0(ptr)
// bne dest, oldval, exitMBB
BuildMI(loop1MBB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
BuildMI(loop1MBB, DL, TII->get(BNE))
.addReg(Dest, RegState::Kill).addReg(OldVal).addMBB(exitMBB);
// loop2MBB:
// move scratch, NewVal
// sc Scratch, Scratch, 0(ptr)
// beq Scratch, $0, loop1MBB
BuildMI(loop2MBB, DL, TII->get(MOVE), Scratch).addReg(NewVal).addReg(ZERO);
BuildMI(loop2MBB, DL, TII->get(SC), Scratch)
.addReg(Scratch).addReg(Ptr).addImm(0);
BuildMI(loop2MBB, DL, TII->get(BEQ))
.addReg(Scratch, RegState::Kill).addReg(ZERO).addMBB(loop1MBB);
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *loop1MBB);
computeAndAddLiveIns(LiveRegs, *loop2MBB);
computeAndAddLiveIns(LiveRegs, *exitMBB);
NMBBI = BB.end();
I->eraseFromParent();
return true;
}
bool MipsExpandPseudo::expandAtomicBinOpSubword(
MachineBasicBlock &BB, MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI) {
MachineFunction *MF = BB.getParent();
const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
DebugLoc DL = I->getDebugLoc();
unsigned LL, SC;
unsigned BEQ = Mips::BEQ;
unsigned SEOp = Mips::SEH;
if (STI->inMicroMipsMode()) {
LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
} else {
LL = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = STI->hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
bool IsSwap = false;
bool IsNand = false;
unsigned Opcode = 0;
switch (I->getOpcode()) {
case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
IsNand = true;
break;
case Mips::ATOMIC_SWAP_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_SWAP_I16_POSTRA:
IsSwap = true;
break;
case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
Opcode = Mips::ADDu;
break;
case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
Opcode = Mips::SUBu;
break;
case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
Opcode = Mips::AND;
break;
case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
Opcode = Mips::OR;
break;
case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
SEOp = Mips::SEB;
LLVM_FALLTHROUGH;
case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
Opcode = Mips::XOR;
break;
default:
llvm_unreachable("Unknown subword atomic pseudo for expansion!");
}
unsigned Dest = I->getOperand(0).getReg();
unsigned Ptr = I->getOperand(1).getReg();
unsigned Incr = I->getOperand(2).getReg();
unsigned Mask = I->getOperand(3).getReg();
unsigned Mask2 = I->getOperand(4).getReg();
unsigned ShiftAmnt = I->getOperand(5).getReg();
unsigned OldVal = I->getOperand(6).getReg();
unsigned BinOpRes = I->getOperand(7).getReg();
unsigned StoreVal = I->getOperand(8).getReg();
const BasicBlock *LLVM_BB = BB.getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB.getIterator();
MF->insert(It, loopMBB);
MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
BB.addSuccessor(loopMBB, BranchProbability::getOne());
loopMBB->addSuccessor(sinkMBB);
loopMBB->addSuccessor(loopMBB);
loopMBB->normalizeSuccProbs();
BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
if (IsNand) {
// and andres, oldval, incr2
// nor binopres, $0, andres
// and newval, binopres, mask
BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
.addReg(OldVal)
.addReg(Incr);
BuildMI(loopMBB, DL, TII->get(Mips::NOR), BinOpRes)
.addReg(Mips::ZERO)
.addReg(BinOpRes);
BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
.addReg(BinOpRes)
.addReg(Mask);
} else if (!IsSwap) {
// <binop> binopres, oldval, incr2
// and newval, binopres, mask
BuildMI(loopMBB, DL, TII->get(Opcode), BinOpRes)
.addReg(OldVal)
.addReg(Incr);
BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
.addReg(BinOpRes)
.addReg(Mask);
} else { // atomic.swap
// and newval, incr2, mask
BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes)
.addReg(Incr)
.addReg(Mask);
}
// and StoreVal, OldVal, Mask2
// or StoreVal, StoreVal, BinOpRes
// StoreVal<tied1> = sc StoreVal, 0(Ptr)
// beq StoreVal, zero, loopMBB
BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal)
.addReg(OldVal).addReg(Mask2);
BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal)
.addReg(StoreVal).addReg(BinOpRes);
BuildMI(loopMBB, DL, TII->get(SC), StoreVal)
.addReg(StoreVal).addReg(Ptr).addImm(0);
BuildMI(loopMBB, DL, TII->get(BEQ))
.addReg(StoreVal).addReg(Mips::ZERO).addMBB(loopMBB);
// sinkMBB:
// and maskedoldval1,oldval,mask
// srl srlres,maskedoldval1,shiftamt
// sign_extend dest,srlres
sinkMBB->addSuccessor(exitMBB, BranchProbability::getOne());
BuildMI(sinkMBB, DL, TII->get(Mips::AND), Dest)
.addReg(OldVal).addReg(Mask);
BuildMI(sinkMBB, DL, TII->get(Mips::SRLV), Dest)
.addReg(Dest).addReg(ShiftAmnt);
if (STI->hasMips32r2()) {
BuildMI(sinkMBB, DL, TII->get(SEOp), Dest).addReg(Dest);
} else {
const unsigned ShiftImm = SEOp == Mips::SEH ? 16 : 24;
BuildMI(sinkMBB, DL, TII->get(Mips::SLL), Dest)
.addReg(Dest, RegState::Kill)
.addImm(ShiftImm);
BuildMI(sinkMBB, DL, TII->get(Mips::SRA), Dest)
.addReg(Dest, RegState::Kill)
.addImm(ShiftImm);
}
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *loopMBB);
computeAndAddLiveIns(LiveRegs, *sinkMBB);
computeAndAddLiveIns(LiveRegs, *exitMBB);
NMBBI = BB.end();
I->eraseFromParent();
return true;
}
bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator &NMBBI,
unsigned Size) {
MachineFunction *MF = BB.getParent();
const bool ArePtrs64bit = STI->getABI().ArePtrs64bit();
DebugLoc DL = I->getDebugLoc();
unsigned LL, SC, ZERO, BEQ;
if (Size == 4) {
if (STI->inMicroMipsMode()) {
LL = STI->hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = STI->hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
BEQ = STI->hasMips32r6() ? Mips::BEQC_MMR6 : Mips::BEQ_MM;
} else {
LL = STI->hasMips32r6()
? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = STI->hasMips32r6()
? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
BEQ = Mips::BEQ;
}
ZERO = Mips::ZERO;
} else {
LL = STI->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
SC = STI->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
ZERO = Mips::ZERO_64;
BEQ = Mips::BEQ64;
}
unsigned OldVal = I->getOperand(0).getReg();
unsigned Ptr = I->getOperand(1).getReg();
unsigned Incr = I->getOperand(2).getReg();
unsigned Scratch = I->getOperand(3).getReg();
unsigned Opcode = 0;
unsigned OR = 0;
unsigned AND = 0;
unsigned NOR = 0;
bool IsNand = false;
switch (I->getOpcode()) {
case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
Opcode = Mips::ADDu;
break;
case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
Opcode = Mips::SUBu;
break;
case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
Opcode = Mips::AND;
break;
case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
Opcode = Mips::OR;
break;
case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
Opcode = Mips::XOR;
break;
case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
IsNand = true;
AND = Mips::AND;
NOR = Mips::NOR;
break;
case Mips::ATOMIC_SWAP_I32_POSTRA:
OR = Mips::OR;
break;
case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
Opcode = Mips::DADDu;
break;
case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
Opcode = Mips::DSUBu;
break;
case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
Opcode = Mips::AND64;
break;
case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
Opcode = Mips::OR64;
break;
case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
Opcode = Mips::XOR64;
break;
case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
IsNand = true;
AND = Mips::AND64;
NOR = Mips::NOR64;
break;
case Mips::ATOMIC_SWAP_I64_POSTRA:
OR = Mips::OR64;
break;
default:
llvm_unreachable("Unknown pseudo atomic!");
}
const BasicBlock *LLVM_BB = BB.getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB.getIterator();
MF->insert(It, loopMBB);
MF->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end());
exitMBB->transferSuccessorsAndUpdatePHIs(&BB);
BB.addSuccessor(loopMBB, BranchProbability::getOne());
loopMBB->addSuccessor(exitMBB);
loopMBB->addSuccessor(loopMBB);
loopMBB->normalizeSuccProbs();
BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
assert((OldVal != Ptr) && "Clobbered the wrong ptr reg!");
assert((OldVal != Incr) && "Clobbered the wrong reg!");
if (Opcode) {
BuildMI(loopMBB, DL, TII->get(Opcode), Scratch).addReg(OldVal).addReg(Incr);
} else if (IsNand) {
assert(AND && NOR &&
"Unknown nand instruction for atomic pseudo expansion");
BuildMI(loopMBB, DL, TII->get(AND), Scratch).addReg(OldVal).addReg(Incr);
BuildMI(loopMBB, DL, TII->get(NOR), Scratch).addReg(ZERO).addReg(Scratch);
} else {
assert(OR && "Unknown instruction for atomic pseudo expansion!");
BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO);
}
BuildMI(loopMBB, DL, TII->get(SC), Scratch).addReg(Scratch).addReg(Ptr).addImm(0);
BuildMI(loopMBB, DL, TII->get(BEQ)).addReg(Scratch).addReg(ZERO).addMBB(loopMBB);
NMBBI = BB.end();
I->eraseFromParent();
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *loopMBB);
computeAndAddLiveIns(LiveRegs, *exitMBB);
return true;
}
bool MipsExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NMBB) {
bool Modified = false;
switch (MBBI->getOpcode()) {
case Mips::ATOMIC_CMP_SWAP_I32_POSTRA:
case Mips::ATOMIC_CMP_SWAP_I64_POSTRA:
return expandAtomicCmpSwap(MBB, MBBI, NMBB);
case Mips::ATOMIC_CMP_SWAP_I8_POSTRA:
case Mips::ATOMIC_CMP_SWAP_I16_POSTRA:
return expandAtomicCmpSwapSubword(MBB, MBBI, NMBB);
case Mips::ATOMIC_SWAP_I8_POSTRA:
case Mips::ATOMIC_SWAP_I16_POSTRA:
case Mips::ATOMIC_LOAD_NAND_I8_POSTRA:
case Mips::ATOMIC_LOAD_NAND_I16_POSTRA:
case Mips::ATOMIC_LOAD_ADD_I8_POSTRA:
case Mips::ATOMIC_LOAD_ADD_I16_POSTRA:
case Mips::ATOMIC_LOAD_SUB_I8_POSTRA:
case Mips::ATOMIC_LOAD_SUB_I16_POSTRA:
case Mips::ATOMIC_LOAD_AND_I8_POSTRA:
case Mips::ATOMIC_LOAD_AND_I16_POSTRA:
case Mips::ATOMIC_LOAD_OR_I8_POSTRA:
case Mips::ATOMIC_LOAD_OR_I16_POSTRA:
case Mips::ATOMIC_LOAD_XOR_I8_POSTRA:
case Mips::ATOMIC_LOAD_XOR_I16_POSTRA:
return expandAtomicBinOpSubword(MBB, MBBI, NMBB);
case Mips::ATOMIC_LOAD_ADD_I32_POSTRA:
case Mips::ATOMIC_LOAD_SUB_I32_POSTRA:
case Mips::ATOMIC_LOAD_AND_I32_POSTRA:
case Mips::ATOMIC_LOAD_OR_I32_POSTRA:
case Mips::ATOMIC_LOAD_XOR_I32_POSTRA:
case Mips::ATOMIC_LOAD_NAND_I32_POSTRA:
case Mips::ATOMIC_SWAP_I32_POSTRA:
return expandAtomicBinOp(MBB, MBBI, NMBB, 4);
case Mips::ATOMIC_LOAD_ADD_I64_POSTRA:
case Mips::ATOMIC_LOAD_SUB_I64_POSTRA:
case Mips::ATOMIC_LOAD_AND_I64_POSTRA:
case Mips::ATOMIC_LOAD_OR_I64_POSTRA:
case Mips::ATOMIC_LOAD_XOR_I64_POSTRA:
case Mips::ATOMIC_LOAD_NAND_I64_POSTRA:
case Mips::ATOMIC_SWAP_I64_POSTRA:
return expandAtomicBinOp(MBB, MBBI, NMBB, 8);
default:
return Modified;
}
}
bool MipsExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
Modified |= expandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
return Modified;
}
bool MipsExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
STI = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
TII = STI->getInstrInfo();
bool Modified = false;
for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
++MFI)
Modified |= expandMBB(*MFI);
if (Modified)
MF.RenumberBlocks();
return Modified;
}
/// createMipsExpandPseudoPass - returns an instance of the pseudo instruction
/// expansion pass.
FunctionPass *llvm::createMipsExpandPseudoPass() {
return new MipsExpandPseudo();
}
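
(A verification note, not part of the patch: the effect of this pass can be
inspected with existing llc flags, e.g.
  llc -march=mipsel -mcpu=mips32r2 -O0 -print-after-all
which dumps the machine code after every pass; after the "Mips pseudo
instruction expansion pass" the *_POSTRA pseudos appear expanded into ll/sc
loops.)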


@@ -1280,76 +1280,76 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default:
llvm_unreachable("Unexpected instr type to insert");
case Mips::ATOMIC_LOAD_ADD_I8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_ADD_I16:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_ADD_I32:
return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_ADD_I64:
return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_AND_I16:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_AND_I32:
return emitAtomicBinary(MI, BB, 4, Mips::AND);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_AND_I64:
return emitAtomicBinary(MI, BB, 8, Mips::AND64);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_OR_I16:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_OR_I32:
return emitAtomicBinary(MI, BB, 4, Mips::OR);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_OR_I64:
return emitAtomicBinary(MI, BB, 8, Mips::OR64);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_XOR_I16:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_XOR_I32:
return emitAtomicBinary(MI, BB, 4, Mips::XOR);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_XOR_I64:
return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I8:
return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_NAND_I16:
return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_NAND_I32:
return emitAtomicBinary(MI, BB, 4, 0, true);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_NAND_I64:
return emitAtomicBinary(MI, BB, 8, 0, true);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_LOAD_SUB_I16:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_LOAD_SUB_I32:
return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_LOAD_SUB_I64:
return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I8:
return emitAtomicBinaryPartword(MI, BB, 1, 0);
return emitAtomicBinaryPartword(MI, BB, 1);
case Mips::ATOMIC_SWAP_I16:
return emitAtomicBinaryPartword(MI, BB, 2, 0);
return emitAtomicBinaryPartword(MI, BB, 2);
case Mips::ATOMIC_SWAP_I32:
return emitAtomicBinary(MI, BB, 4, 0);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_SWAP_I64:
return emitAtomicBinary(MI, BB, 8, 0);
return emitAtomicBinary(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I8:
return emitAtomicCmpSwapPartword(MI, BB, 1);
case Mips::ATOMIC_CMP_SWAP_I16:
return emitAtomicCmpSwapPartword(MI, BB, 2);
case Mips::ATOMIC_CMP_SWAP_I32:
return emitAtomicCmpSwap(MI, BB, 4);
return emitAtomicCmpSwap(MI, BB);
case Mips::ATOMIC_CMP_SWAP_I64:
return emitAtomicCmpSwap(MI, BB, 8);
return emitAtomicCmpSwap(MI, BB);
case Mips::PseudoSDIV:
case Mips::PseudoUDIV:
case Mips::DIV:
@@ -1398,99 +1398,121 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size,
unsigned BinOpcode,
bool Nand) const {
assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
MachineBasicBlock *
MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
unsigned LL, SC, AND, NOR, ZERO, BEQ;
if (Size == 4) {
if (isMicroMips) {
LL = Subtarget.hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = Subtarget.hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
} else {
LL = Subtarget.hasMips32r6()
? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = Subtarget.hasMips32r6()
? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
AND = Mips::AND;
NOR = Mips::NOR;
ZERO = Mips::ZERO;
BEQ = Mips::BEQ;
} else {
LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
AND = Mips::AND64;
NOR = Mips::NOR64;
ZERO = Mips::ZERO_64;
BEQ = Mips::BEQ64;
unsigned AtomicOp;
switch (MI.getOpcode()) {
case Mips::ATOMIC_LOAD_ADD_I32:
AtomicOp = Mips::ATOMIC_LOAD_ADD_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_SUB_I32:
AtomicOp = Mips::ATOMIC_LOAD_SUB_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_AND_I32:
AtomicOp = Mips::ATOMIC_LOAD_AND_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_OR_I32:
AtomicOp = Mips::ATOMIC_LOAD_OR_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_XOR_I32:
AtomicOp = Mips::ATOMIC_LOAD_XOR_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_NAND_I32:
AtomicOp = Mips::ATOMIC_LOAD_NAND_I32_POSTRA;
break;
case Mips::ATOMIC_SWAP_I32:
AtomicOp = Mips::ATOMIC_SWAP_I32_POSTRA;
break;
case Mips::ATOMIC_LOAD_ADD_I64:
AtomicOp = Mips::ATOMIC_LOAD_ADD_I64_POSTRA;
break;
case Mips::ATOMIC_LOAD_SUB_I64:
AtomicOp = Mips::ATOMIC_LOAD_SUB_I64_POSTRA;
break;
case Mips::ATOMIC_LOAD_AND_I64:
AtomicOp = Mips::ATOMIC_LOAD_AND_I64_POSTRA;
break;
case Mips::ATOMIC_LOAD_OR_I64:
AtomicOp = Mips::ATOMIC_LOAD_OR_I64_POSTRA;
break;
case Mips::ATOMIC_LOAD_XOR_I64:
AtomicOp = Mips::ATOMIC_LOAD_XOR_I64_POSTRA;
break;
case Mips::ATOMIC_LOAD_NAND_I64:
AtomicOp = Mips::ATOMIC_LOAD_NAND_I64_POSTRA;
break;
case Mips::ATOMIC_SWAP_I64:
AtomicOp = Mips::ATOMIC_SWAP_I64_POSTRA;
break;
default:
llvm_unreachable("Unknown pseudo atomic for replacement!");
}
unsigned OldVal = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned Incr = MI.getOperand(2).getReg();
unsigned Scratch = RegInfo.createVirtualRegister(RegInfo.getRegClass(OldVal));
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned AndRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
MachineBasicBlock::iterator II(MI);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loopMBB);
MF->insert(It, exitMBB);
// The scratch registers here with the EarlyClobber | Define | Implicit
// flags are used to persuade the register allocator and the machine
// verifier to accept the usage of this register. This has to be a real
// register which has an UNDEF value coming in but is dead after the
// instruction, and which is unique among the registers chosen for the
// instruction.
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// The EarlyClobber flag has the semantic property that the operand it is
// attached to is clobbered before the rest of the inputs are read. Hence it
// must be unique among the operands to the instruction.
// The Define flag is needed to convince the machine verifier that an Undef
// value isn't a problem.
// The Dead flag is needed as the value in scratch isn't used by any other
// instruction. Kill isn't used as Dead is more precise.
// The implicit flag is here due to the interaction between the other flags
// and the machine verifier.
// thisMBB:
// ...
// fallthrough --> loopMBB
BB->addSuccessor(loopMBB);
loopMBB->addSuccessor(loopMBB);
loopMBB->addSuccessor(exitMBB);
// For correctness purposes, a new pseudo is introduced here. We need this
// new pseudo, so that FastRegisterAllocator does not see an ll/sc sequence
// that is spread over more than one basic block. A register allocator (or,
// in fact, any part of codegen) which introduces a store can violate the
// expectations of the hardware.
//
// An atomic read-modify-write sequence starts with a linked load
// instruction and ends with a store conditional instruction. The atomic
// read-modify-write sequence fails if any of the following conditions
// occur between the execution of ll and sc:
// * A coherent store is completed by another process or coherent I/O
// module into the block of synchronizable physical memory containing
// the word. The size and alignment of the block is
// implementation-dependent.
// * A coherent store is executed between an LL and SC sequence on the
// same processor to the block of synchronizable physical memory
// containing the word.
//
// loopMBB:
// ll oldval, 0(ptr)
// <binop> storeval, oldval, incr
// sc success, storeval, 0(ptr)
// beq success, $0, loopMBB
BB = loopMBB;
BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
if (Nand) {
// and andres, oldval, incr
// nor storeval, $0, andres
BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
} else if (BinOpcode) {
// <binop> storeval, oldval, incr
BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
} else {
StoreVal = Incr;
}
BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
unsigned PtrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Ptr));
unsigned IncrCopy = RegInfo.createVirtualRegister(RegInfo.getRegClass(Incr));
MI.eraseFromParent(); // The instruction is gone now.
BuildMI(*BB, II, DL, TII->get(Mips::COPY), IncrCopy).addReg(Incr);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
return exitMBB;
BuildMI(*BB, II, DL, TII->get(AtomicOp))
.addReg(OldVal, RegState::Define | RegState::EarlyClobber)
.addReg(PtrCopy)
.addReg(IncrCopy)
.addReg(Scratch, RegState::Define | RegState::EarlyClobber |
RegState::Implicit | RegState::Dead);
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
@@ -1524,8 +1546,7 @@ MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
}
MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
bool Nand) const {
MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicBinaryPartial.");
@@ -1546,39 +1567,66 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
unsigned NewVal = RegInfo.createVirtualRegister(RC);
unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned Incr2 = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned AndRes = RegInfo.createVirtualRegister(RC);
unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
unsigned Scratch = RegInfo.createVirtualRegister(RC);
unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
unsigned Scratch3 = RegInfo.createVirtualRegister(RC);
unsigned LL, SC;
if (isMicroMips) {
LL = Subtarget.hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = Subtarget.hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
} else {
LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
unsigned AtomicOp = 0;
switch (MI.getOpcode()) {
case Mips::ATOMIC_LOAD_NAND_I8:
AtomicOp = Mips::ATOMIC_LOAD_NAND_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_NAND_I16:
AtomicOp = Mips::ATOMIC_LOAD_NAND_I16_POSTRA;
break;
case Mips::ATOMIC_SWAP_I8:
AtomicOp = Mips::ATOMIC_SWAP_I8_POSTRA;
break;
case Mips::ATOMIC_SWAP_I16:
AtomicOp = Mips::ATOMIC_SWAP_I16_POSTRA;
break;
case Mips::ATOMIC_LOAD_ADD_I8:
AtomicOp = Mips::ATOMIC_LOAD_ADD_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_ADD_I16:
AtomicOp = Mips::ATOMIC_LOAD_ADD_I16_POSTRA;
break;
case Mips::ATOMIC_LOAD_SUB_I8:
AtomicOp = Mips::ATOMIC_LOAD_SUB_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_SUB_I16:
AtomicOp = Mips::ATOMIC_LOAD_SUB_I16_POSTRA;
break;
case Mips::ATOMIC_LOAD_AND_I8:
AtomicOp = Mips::ATOMIC_LOAD_AND_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_AND_I16:
AtomicOp = Mips::ATOMIC_LOAD_AND_I16_POSTRA;
break;
case Mips::ATOMIC_LOAD_OR_I8:
AtomicOp = Mips::ATOMIC_LOAD_OR_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_OR_I16:
AtomicOp = Mips::ATOMIC_LOAD_OR_I16_POSTRA;
break;
case Mips::ATOMIC_LOAD_XOR_I8:
AtomicOp = Mips::ATOMIC_LOAD_XOR_I8_POSTRA;
break;
case Mips::ATOMIC_LOAD_XOR_I16:
AtomicOp = Mips::ATOMIC_LOAD_XOR_I16_POSTRA;
break;
default:
llvm_unreachable("Unknown subword atomic pseudo for expansion!");
}
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loopMBB);
MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1586,10 +1634,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(loopMBB);
loopMBB->addSuccessor(loopMBB);
loopMBB->addSuccessor(sinkMBB);
sinkMBB->addSuccessor(exitMBB);
BB->addSuccessor(exitMBB, BranchProbability::getOne());
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
@@ -1623,159 +1668,92 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
// atomic.load.binop
// loopMBB:
// ll oldval,0(alignedaddr)
// binop binopres,oldval,incr2
// and newval,binopres,mask
// and maskedoldval0,oldval,mask2
// or storeval,maskedoldval0,newval
// sc success,storeval,0(alignedaddr)
// beq success,$0,loopMBB
// atomic.swap
// loopMBB:
// ll oldval,0(alignedaddr)
// and newval,incr2,mask
// and maskedoldval0,oldval,mask2
// or storeval,maskedoldval0,newval
// sc success,storeval,0(alignedaddr)
// beq success,$0,loopMBB
// The purposes of the flags on the scratch registers are explained in
// emitAtomicBinary. In summary, we need a scratch register which is going to
// be undef, and which is unique among the registers chosen for the
// instruction.
BB = loopMBB;
BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
if (Nand) {
// and andres, oldval, incr2
// nor binopres, $0, andres
// and newval, binopres, mask
BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
.addReg(Mips::ZERO).addReg(AndRes);
BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
} else if (BinOpcode) {
// <binop> binopres, oldval, incr2
// and newval, binopres, mask
BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
} else { // atomic.swap
// and newval, incr2, mask
BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
}
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal0).addReg(NewVal);
BuildMI(BB, DL, TII->get(SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
.addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
// sinkMBB:
// and maskedoldval1,oldval,mask
// srl srlres,maskedoldval1,shiftamt
// sign_extend dest,srlres
BB = sinkMBB;
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
.addReg(OldVal).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
.addReg(MaskedOldVal1).addReg(ShiftAmt);
BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
BuildMI(BB, DL, TII->get(AtomicOp))
.addReg(Dest, RegState::Define | RegState::EarlyClobber)
.addReg(AlignedAddr)
.addReg(Incr2)
.addReg(Mask)
.addReg(Mask2)
.addReg(ShiftAmt)
.addReg(Scratch, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit)
.addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit)
.addReg(Scratch3, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size) const {
assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
// Lower atomic compare and swap to a pseudo instruction, taking care to
// define a scratch register for the pseudo instruction's expansion. The
// instruction is expanded after the register allocator so as to prevent
// the insertion of stores between the linked load and the store conditional.
MachineBasicBlock *
MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
MachineBasicBlock *BB) const {
assert((MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I64) &&
"Unsupported atomic psseudo for EmitAtomicCmpSwap.");
const unsigned Size = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32 ? 4 : 8;
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const bool ArePtrs64bit = ABI.ArePtrs64bit();
DebugLoc DL = MI.getDebugLoc();
unsigned LL, SC, ZERO, BNE, BEQ;
if (Size == 4) {
if (isMicroMips) {
LL = Subtarget.hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = Subtarget.hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
} else {
LL = Subtarget.hasMips32r6()
? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = Subtarget.hasMips32r6()
? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
ZERO = Mips::ZERO;
BNE = Mips::BNE;
BEQ = Mips::BEQ;
} else {
LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
ZERO = Mips::ZERO_64;
BNE = Mips::BNE64;
BEQ = Mips::BEQ64;
}
unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I32
? Mips::ATOMIC_CMP_SWAP_I32_POSTRA
: Mips::ATOMIC_CMP_SWAP_I64_POSTRA;
unsigned Dest = MI.getOperand(0).getReg();
unsigned Ptr = MI.getOperand(1).getReg();
unsigned OldVal = MI.getOperand(2).getReg();
unsigned NewVal = MI.getOperand(3).getReg();
unsigned Success = RegInfo.createVirtualRegister(RC);
unsigned Scratch = MRI.createVirtualRegister(RC);
MachineBasicBlock::iterator II(MI);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, exitMBB);
// We need to create copies of the various registers and kill them at the
// atomic pseudo. If the copies are not made, when the atomic is expanded
// after fast register allocation, the spills will end up outside of the
// blocks that their values are defined in, causing livein errors.
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
unsigned DestCopy = MRI.createVirtualRegister(MRI.getRegClass(Dest));
unsigned PtrCopy = MRI.createVirtualRegister(MRI.getRegClass(Ptr));
unsigned OldValCopy = MRI.createVirtualRegister(MRI.getRegClass(OldVal));
unsigned NewValCopy = MRI.createVirtualRegister(MRI.getRegClass(NewVal));
// thisMBB:
// ...
// fallthrough --> loop1MBB
BB->addSuccessor(loop1MBB);
loop1MBB->addSuccessor(exitMBB);
loop1MBB->addSuccessor(loop2MBB);
loop2MBB->addSuccessor(loop1MBB);
loop2MBB->addSuccessor(exitMBB);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), DestCopy).addReg(Dest);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), PtrCopy).addReg(Ptr);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), OldValCopy).addReg(OldVal);
BuildMI(*BB, II, DL, TII->get(Mips::COPY), NewValCopy).addReg(NewVal);
// loop1MBB:
// ll dest, 0(ptr)
// bne dest, oldval, exitMBB
BB = loop1MBB;
BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
BuildMI(BB, DL, TII->get(BNE))
.addReg(Dest).addReg(OldVal).addMBB(exitMBB);
// The purposes of the flags on the scratch registers are explained in
// emitAtomicBinary. In summary, we need a scratch register which is going to
// be undef, and which is unique among the registers chosen for the
// instruction.
// loop2MBB:
// sc success, newval, 0(ptr)
// beq success, $0, loop1MBB
BB = loop2MBB;
BuildMI(BB, DL, TII->get(SC), Success)
.addReg(NewVal).addReg(Ptr).addImm(0);
BuildMI(BB, DL, TII->get(BEQ))
.addReg(Success).addReg(ZERO).addMBB(loop1MBB);
BuildMI(*BB, II, DL, TII->get(AtomicOp))
.addReg(Dest, RegState::Define | RegState::EarlyClobber)
.addReg(PtrCopy, RegState::Kill)
.addReg(OldValCopy, RegState::Kill)
.addReg(NewValCopy, RegState::Kill)
.addReg(Scratch, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.
return exitMBB;
return BB;
}
MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
@@ -1802,40 +1780,33 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
unsigned LL, SC;
unsigned AtomicOp = MI.getOpcode() == Mips::ATOMIC_CMP_SWAP_I8
? Mips::ATOMIC_CMP_SWAP_I8_POSTRA
: Mips::ATOMIC_CMP_SWAP_I16_POSTRA;
if (isMicroMips) {
LL = Subtarget.hasMips32r6() ? Mips::LL_MMR6 : Mips::LL_MM;
SC = Subtarget.hasMips32r6() ? Mips::SC_MMR6 : Mips::SC_MM;
} else {
LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
: (ArePtrs64bit ? Mips::LL64 : Mips::LL);
SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
: (ArePtrs64bit ? Mips::SC64 : Mips::SC);
}
// The scratch registers here with the EarlyClobber | Define | Dead | Implicit
// flags are used to coerce the register allocator and the machine verifier to
// accept the usage of these registers.
// The EarlyClobber flag has the semantic property that the operand it is
// attached to is clobbered before the rest of the inputs are read. Hence it
// must be unique among the operands to the instruction.
// The Define flag is needed to convince the machine verifier that an Undef
// value isn't a problem.
// The Dead flag is needed as the value in scratch isn't used by any other
// instruction. Kill isn't used as Dead is more precise.
unsigned Scratch = RegInfo.createVirtualRegister(RC);
unsigned Scratch2 = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++BB->getIterator();
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1843,14 +1814,8 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(loop1MBB);
loop1MBB->addSuccessor(sinkMBB);
loop1MBB->addSuccessor(loop2MBB);
loop2MBB->addSuccessor(loop1MBB);
loop2MBB->addSuccessor(sinkMBB);
sinkMBB->addSuccessor(exitMBB);
BB->addSuccessor(exitMBB, BranchProbability::getOne());
// FIXME: computation of newval2 can be moved to loop2MBB.
// thisMBB:
// addiu masklsb2,$0,-4 # 0xfffffffc
// and alignedaddr,ptr,masklsb2
@@ -1893,40 +1858,22 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
.addReg(MaskedNewVal).addReg(ShiftAmt);
// loop1MBB:
// ll oldval,0(alignedaddr)
// and maskedoldval0,oldval,mask
// bne maskedoldval0,shiftedcmpval,sinkMBB
BB = loop1MBB;
BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
.addReg(OldVal).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::BNE))
.addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
// The purposes of the flags on the scratch registers are explained in
// emitAtomicBinary. In summary, we need a scratch register which is going to
// be undef, and which is unique among the registers chosen for the
// instruction.
// loop2MBB:
// and maskedoldval1,oldval,mask2
// or storeval,maskedoldval1,shiftednewval
// sc success,storeval,0(alignedaddr)
// beq success,$0,loop1MBB
BB = loop2MBB;
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal1).addReg(ShiftedNewVal);
BuildMI(BB, DL, TII->get(SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
.addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
// sinkMBB:
// srl srlres,maskedoldval0,shiftamt
// sign_extend dest,srlres
BB = sinkMBB;
BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
.addReg(MaskedOldVal0).addReg(ShiftAmt);
BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
BuildMI(BB, DL, TII->get(AtomicOp))
.addReg(Dest, RegState::Define | RegState::EarlyClobber)
.addReg(AlignedAddr)
.addReg(Mask)
.addReg(ShiftedCmpVal)
.addReg(Mask2)
.addReg(ShiftedNewVal)
.addReg(ShiftAmt)
.addReg(Scratch, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit)
.addReg(Scratch2, RegState::EarlyClobber | RegState::Define |
RegState::Dead | RegState::Implicit);
MI.eraseFromParent(); // The instruction is gone now.


@@ -679,17 +679,13 @@ class TargetRegisterClass;
unsigned Size, unsigned DstReg,
unsigned SrcRec) const;
MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Size, unsigned BinOpcode,
bool Nand = false) const;
MachineBasicBlock *emitAtomicBinary(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size,
unsigned BinOpcode,
bool Nand = false) const;
unsigned Size) const;
MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size) const;
MachineBasicBlock *BB) const;
MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned Size) const;


@@ -1852,11 +1852,37 @@ class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
class Atomic2OpsPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr), []> {
let mayLoad = 1;
let mayStore = 1;
}
class Atomic2OpsSubwordPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$incr, RC:$mask, RC:$mask2,
RC:$shiftamnt), []>;
// Atomic Compare & Swap.
// Atomic compare and swap is lowered into two stages. The first stage happens
// during ISelLowering, which produces the PostRA version of this instruction.
class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
[(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
class AtomicCmpSwapPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$cmp, RC:$swap), []> {
let mayLoad = 1;
let mayStore = 1;
}
class AtomicCmpSwapSubwordPostRA<RegisterClass RC> :
PseudoSE<(outs RC:$dst), (ins PtrRC:$ptr, RC:$mask, RC:$ShiftCmpVal,
RC:$mask2, RC:$ShiftNewVal, RC:$ShiftAmt), []> {
let mayLoad = 1;
let mayStore = 1;
}
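// An illustrative sketch of the two stages for a 32-bit compare and swap
// (instruction names as defined in this file; operand shapes are
// approximate):
//
//   ATOMIC_CMP_SWAP_I32 dst, ptr, cmp, swap
//     --(emitAtomicCmpSwap, during custom inserter lowering)-->
//   ATOMIC_CMP_SWAP_I32_POSTRA dst, ptr, cmp, swap (+ implicit scratch)
//     --(MipsExpandPseudo, after register allocation)-->
//   an ll/bne/sc/beq retry loop with no spills between the ll and the sc.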
class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
[], II_LL, FrmI, opstr> {
@@ -1942,8 +1968,36 @@ let usesCustomInserter = 1 in {
def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
}
def ATOMIC_LOAD_ADD_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_ADD_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_ADD_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_LOAD_SUB_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_SUB_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_SUB_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_LOAD_AND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_AND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_AND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_LOAD_OR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_OR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_OR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_LOAD_XOR_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_XOR_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_XOR_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_LOAD_NAND_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_NAND_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_LOAD_NAND_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_SWAP_I8_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_SWAP_I16_POSTRA : Atomic2OpsSubwordPostRA<GPR32>;
def ATOMIC_SWAP_I32_POSTRA : Atomic2OpsPostRA<GPR32>;
def ATOMIC_CMP_SWAP_I8_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
def ATOMIC_CMP_SWAP_I16_POSTRA : AtomicCmpSwapSubwordPostRA<GPR32>;
def ATOMIC_CMP_SWAP_I32_POSTRA : AtomicCmpSwapPostRA<GPR32>;
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
def LOAD_ACC64 : Load<"", ACC64>;


@@ -240,6 +240,7 @@ public:
bool addInstSelector() override;
void addPreEmitPass() override;
void addPreRegAlloc() override;
void addPreEmit2();
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
@@ -285,10 +286,18 @@ MipsTargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(BasicTTIImpl(this, F));
}
void MipsPassConfig::addPreEmit2() {
}
// Implemented by targets that want to run passes immediately before
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
// Expand pseudo instructions that are sensitive to register allocation.
addPass(createMipsExpandPseudoPass());
// The microMIPS size reduction pass performs instruction reselection for
// instructions which can be remapped to a 16 bit instruction.
addPass(createMicroMipsSizeReducePass());
// The delay slot filler pass can potentially create forbidden slot hazards

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,17 +1,113 @@
; RUN: llc -O0 -march=mipsel -mcpu=mips32r2 -target-abi=o32 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=PTR32,ALL %s
; RUN: llc -O0 -march=mips64el -mcpu=mips64r2 -target-abi=n32 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=PTR32,ALL %s
; RUN: llc -O0 -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=PTR64,ALL %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O0 -mtriple=mipsel-unknown-linux-gnu -mcpu=mips32r2 -target-abi=o32 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=O32 %s
; RUN: llc -O0 -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64r2 -target-abi=n32 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=N32,ALL %s
; RUN: llc -O0 -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64r2 -target-abi=n64 < %s -filetype=asm -o - \
; RUN: | FileCheck -check-prefixes=N64 %s
; PTR32: lw $[[R0:[0-9]+]]
; PTR64: ld $[[R0:[0-9]+]]
@sym = external global i32 *
; ALL: ll ${{[0-9]+}}, 0($[[R0]])
define {i16, i1} @foo(i16* %addr, i16 signext %r, i16 zeroext %new) {
%res = cmpxchg i16* %addr, i16 %r, i16 %new seq_cst seq_cst
ret {i16, i1} %res
define void @foo(i32 %new, i32 %old) {
; O32-LABEL: foo:
; O32: # %bb.0: # %entry
; O32-NEXT: addiu $sp, $sp, -16
; O32-NEXT: .cfi_def_cfa_offset 16
; O32-NEXT: move $1, $5
; O32-NEXT: move $2, $4
; O32-NEXT: lui $3, %hi(sym)
; O32-NEXT: lw $3, %lo(sym)($3)
; O32-NEXT: sync
; O32-NEXT: lw $6, 12($sp) # 4-byte Folded Reload
; O32-NEXT: $BB0_1: # %entry
; O32-NEXT: # =>This Inner Loop Header: Depth=1
; O32-NEXT: ll $7, 0($3)
; O32-NEXT: bne $7, $4, $BB0_3
; O32-NEXT: nop
; O32-NEXT: # %bb.2: # %entry
; O32-NEXT: # in Loop: Header=BB0_1 Depth=1
; O32-NEXT: move $8, $5
; O32-NEXT: sc $8, 0($3)
; O32-NEXT: beqz $8, $BB0_1
; O32-NEXT: nop
; O32-NEXT: $BB0_3: # %entry
; O32-NEXT: sync
; O32-NEXT: sw $7, 12($sp) # 4-byte Folded Spill
; O32-NEXT: sw $6, 8($sp) # 4-byte Folded Spill
; O32-NEXT: sw $1, 4($sp) # 4-byte Folded Spill
; O32-NEXT: sw $2, 0($sp) # 4-byte Folded Spill
; O32-NEXT: addiu $sp, $sp, 16
; O32-NEXT: jr $ra
; O32-NEXT: nop
;
; N32-LABEL: foo:
; N32: # %bb.0: # %entry
; N32-NEXT: addiu $sp, $sp, -16
; N32-NEXT: .cfi_def_cfa_offset 16
; N32-NEXT: move $1, $5
; N32-NEXT: sll $1, $1, 0
; N32-NEXT: move $2, $4
; N32-NEXT: sll $2, $2, 0
; N32-NEXT: lui $3, %hi(sym)
; N32-NEXT: lw $3, %lo(sym)($3)
; N32-NEXT: sync
; N32-NEXT: lw $6, 12($sp) # 4-byte Folded Reload
; N32-NEXT: .LBB0_1: # %entry
; N32-NEXT: # =>This Inner Loop Header: Depth=1
; N32-NEXT: ll $7, 0($3)
; N32-NEXT: bne $7, $2, .LBB0_3
; N32-NEXT: nop
; N32-NEXT: # %bb.2: # %entry
; N32-NEXT: # in Loop: Header=BB0_1 Depth=1
; N32-NEXT: move $8, $1
; N32-NEXT: sc $8, 0($3)
; N32-NEXT: beqz $8, .LBB0_1
; N32-NEXT: nop
; N32-NEXT: .LBB0_3: # %entry
; N32-NEXT: sync
; N32-NEXT: sw $7, 12($sp) # 4-byte Folded Spill
; N32-NEXT: sw $6, 8($sp) # 4-byte Folded Spill
; N32-NEXT: addiu $sp, $sp, 16
; N32-NEXT: jr $ra
; N32-NEXT: nop
;
; N64-LABEL: foo:
; N64: # %bb.0: # %entry
; N64-NEXT: daddiu $sp, $sp, -16
; N64-NEXT: .cfi_def_cfa_offset 16
; N64-NEXT: move $1, $5
; N64-NEXT: sll $1, $1, 0
; N64-NEXT: move $2, $4
; N64-NEXT: sll $2, $2, 0
; N64-NEXT: lui $4, %highest(sym)
; N64-NEXT: daddiu $4, $4, %higher(sym)
; N64-NEXT: dsll $4, $4, 16
; N64-NEXT: daddiu $4, $4, %hi(sym)
; N64-NEXT: dsll $4, $4, 16
; N64-NEXT: ld $4, %lo(sym)($4)
; N64-NEXT: sync
; N64-NEXT: lw $3, 12($sp) # 4-byte Folded Reload
; N64-NEXT: .LBB0_1: # %entry
; N64-NEXT: # =>This Inner Loop Header: Depth=1
; N64-NEXT: ll $6, 0($4)
; N64-NEXT: bne $6, $2, .LBB0_3
; N64-NEXT: nop
; N64-NEXT: # %bb.2: # %entry
; N64-NEXT: # in Loop: Header=BB0_1 Depth=1
; N64-NEXT: move $7, $1
; N64-NEXT: sc $7, 0($4)
; N64-NEXT: beqz $7, .LBB0_1
; N64-NEXT: nop
; N64-NEXT: .LBB0_3: # %entry
; N64-NEXT: sync
; N64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill
; N64-NEXT: sw $3, 8($sp) # 4-byte Folded Spill
; N64-NEXT: daddiu $sp, $sp, 16
; N64-NEXT: jr $ra
; N64-NEXT: nop
entry:
%0 = load i32 *, i32 ** @sym
cmpxchg i32 * %0, i32 %new, i32 %old seq_cst seq_cst
ret void
}


@@ -1,18 +1,25 @@
; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -mtriple=mipsel-unknown-linux-gnu -mcpu=mips32r2 -mattr=micromips -filetype=asm \
; RUN: -relocation-model=pic -o - | FileCheck %s
@x = common global i32 0, align 4
define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
; CHECK-LABEL: AtomicLoadAdd32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui $2, %hi(_gp_disp)
; CHECK-NEXT: addiu $2, $2, %lo(_gp_disp)
; CHECK-NEXT: addu $2, $2, $25
; CHECK-NEXT: lw $1, %got(x)($2)
; CHECK-NEXT: $BB0_1: # %entry
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ll $2, 0($1)
; CHECK-NEXT: addu16 $3, $2, $4
; CHECK-NEXT: sc $3, 0($1)
; CHECK-NEXT: beqzc $3, $BB0_1
; CHECK-NEXT: # %bb.2: # %entry
; CHECK-NEXT: jrc $ra
entry:
%0 = atomicrmw add i32* @x, i32 %incr monotonic
ret i32 %0
; CHECK-LABEL: AtomicLoadAdd32:
; CHECK: lw $[[R0:[0-9]+]], %got(x)
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll $[[R1:[0-9]+]], 0($[[R0]])
; CHECK: addu $[[R2:[0-9]+]], $[[R1]], $4
; CHECK: sc $[[R2]], 0($[[R0]])
; CHECK: beqzc $[[R2]], $[[BB0]]
}