diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index c618c4f60e5c..a2bb2b2922a8 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -4955,14 +4955,16 @@ bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
     return false;
 
   const TargetRegisterClass *ResRC;
-  unsigned Opc;
+  unsigned Opc, CmpOpc;
   // This only supports i32/i64, because i8/i16 aren't legal, and the generic
   // extractvalue selection doesn't support that.
   if (VT == MVT::i32) {
     Opc = AArch64::CMP_SWAP_32;
+    CmpOpc = AArch64::SUBSWrs;
     ResRC = &AArch64::GPR32RegClass;
   } else if (VT == MVT::i64) {
     Opc = AArch64::CMP_SWAP_64;
+    CmpOpc = AArch64::SUBSXrs;
     ResRC = &AArch64::GPR64RegClass;
   } else {
     return false;
@@ -4979,14 +4981,27 @@ bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
 
   const unsigned ResultReg1 = createResultReg(ResRC);
   const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
+  const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass);
 
   // FIXME: MachineMemOperand doesn't support cmpxchg yet.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(ResultReg1, RegState::Define)
-      .addReg(ResultReg2, RegState::Define)
-      .addReg(AddrReg)
-      .addReg(DesiredReg)
-      .addReg(NewReg);
+      .addDef(ResultReg1)
+      .addDef(ScratchReg)
+      .addUse(AddrReg)
+      .addUse(DesiredReg)
+      .addUse(NewReg);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+      .addDef(VT == MVT::i32 ? AArch64::WZR : AArch64::XZR)
+      .addUse(ResultReg1)
+      .addUse(DesiredReg)
+      .addImm(0);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr))
+      .addDef(ResultReg2)
+      .addUse(AArch64::WZR)
+      .addUse(AArch64::WZR)
+      .addImm(AArch64CC::NE);
 
   assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers.");
   updateValueMap(I, ResultReg1, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 59de62ad2877..867074c3c374 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -377,28 +377,28 @@ def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
 // significantly more naive than the standard expansion: we conservatively
 // assume seq_cst, strong cmpxchg and omit clrex on failure.
 
-let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+let Constraints = "@earlyclobber $Rd,@earlyclobber $scratch",
     mayLoad = 1, mayStore = 1 in {
-def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
                         (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
                  Sched<[WriteAtomic]>;
 
-def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
                          (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
                   Sched<[WriteAtomic]>;
 
-def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status),
+def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
                          (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
                   Sched<[WriteAtomic]>;
 
-def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status),
+def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
                          (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>,
                   Sched<[WriteAtomic]>;
 }
 
-let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status",
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
     mayLoad = 1, mayStore = 1 in
-def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status),
+def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
                           (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
                                GPR64:$newLo, GPR64:$newHi), []>,
                    Sched<[WriteAtomic]>;
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll b/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
index d20e693df83e..aa78210fae74 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
@@ -9,10 +9,11 @@
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w2, [x0]
 ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
+; CHECK-NEXT: cmp [[OLD]], w1
+; CHECK-NEXT: cset [[STATUS:w[0-9]+]], eq
 ; CHECK-NEXT: and [[STATUS32:w[0-9]+]], [[STATUS]], #0x1
 ; CHECK-NEXT: str [[STATUS32]], [x3]
 ; CHECK-NEXT: mov w0, [[OLD]]
-; CHECK-NEXT: ret
 define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 {
   %tmp0 = cmpxchg i32* %p, i32 %cmp, i32 %new monotonic monotonic
   %tmp1 = extractvalue { i32, i1 } %tmp0, 0
@@ -24,7 +25,7 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 {
 
 ; CHECK-LABEL: cmpxchg_acq_rel_32_load:
 ; CHECK: // BB#0:
-; CHECK-NEXT: ldr [[NEW:w[0-9]+]], [x2]
+; CHECK: ldr [[NEW:w[0-9]+]], [x2]
 ; CHECK-NEXT: [[RETRY:.LBB[0-9_]+]]:
 ; CHECK-NEXT: ldaxr [[OLD:w[0-9]+]], [x0]
 ; CHECK-NEXT: cmp [[OLD]], w1
@@ -33,10 +34,11 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 {
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x0]
 ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
+; CHECK-NEXT: cmp [[OLD]], w1
+; CHECK-NEXT: cset [[STATUS:w[0-9]+]], eq
 ; CHECK-NEXT: and [[STATUS32:w[0-9]+]], [[STATUS]], #0x1
 ; CHECK-NEXT: str [[STATUS32]], [x3]
 ; CHECK-NEXT: mov w0, [[OLD]]
-; CHECK-NEXT: ret
 define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0 {
   %new = load i32, i32* %pnew
   %tmp0 = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel acquire
@@ -56,10 +58,11 @@ define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], x2, [x0]
 ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
+; CHECK-NEXT: cmp [[OLD]], x1
+; CHECK-NEXT: cset [[STATUS:w[0-9]+]], eq
 ; CHECK-NEXT: and [[STATUS32:w[0-9]+]], [[STATUS]], #0x1
 ; CHECK-NEXT: str [[STATUS32]], [x3]
 ; CHECK-NEXT: mov x0, [[OLD]]
-; CHECK-NEXT: ret
 define i64 @cmpxchg_seq_cst_64(i64* %p, i64 %cmp, i64 %new, i32* %ps) #0 {
   %tmp0 = cmpxchg i64* %p, i64 %cmp, i64 %new seq_cst seq_cst
   %tmp1 = extractvalue { i64, i1 } %tmp0, 0
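
Note (not part of the patch): after this change the CMP_SWAP_* pseudos no longer produce the i1 success value as their second result; FastISel recomputes it after the LL/SC loop with the SUBS/CSINC (cmp/cset) pair emitted above. Below is a minimal illustrative LLVM IR sketch of an input that exercises the recomputed flag, written in the same typed-pointer style as the patched test; the function and value names are hypothetical, not taken from the patch.

define i32 @cmpxchg_success_32(i32* %p, i32 %cmp, i32 %new) {
  ; At -O0, FastISel selects CMP_SWAP_32 for this cmpxchg and then emits
  ; cmp/cset to materialize the success bit extracted below.
  %pair = cmpxchg i32* %p, i32 %cmp, i32 %new seq_cst seq_cst
  %success = extractvalue { i32, i1 } %pair, 1
  %ret = zext i1 %success to i32
  ret i32 %ret
}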