Allow FP types for atomicrmw xchg

llvm-svn: 351427
Matt Arsenault 2019-01-17 10:49:01 +00:00
parent bd13c9787f
commit 0cb08e448a
24 changed files with 375 additions and 16 deletions

View File

@ -8584,13 +8584,14 @@ operation. The operation must be one of the following keywords:
- umax
- umin

-The type of '<value>' must be an integer type whose bit width is a power
-of two greater than or equal to eight and less than or equal to a
-target-specific size limit. The type of the '``<pointer>``' operand must
-be a pointer to that type. If the ``atomicrmw`` is marked as
-``volatile``, then the optimizer is not allowed to modify the number or
-order of execution of this ``atomicrmw`` with other :ref:`volatile
-operations <volatile>`.
+For most of these operations, the type of '<value>' must be an integer
+type whose bit width is a power of two greater than or equal to eight
+and less than or equal to a target-specific size limit. For xchg, this
+may also be a floating point type with the same size constraints as
+integers. The type of the '``<pointer>``' operand must be a pointer to
+that type. If the ``atomicrmw`` is marked as ``volatile``, then the
+optimizer is not allowed to modify the number or order of execution of
+this ``atomicrmw`` with other :ref:`volatile operations <volatile>`.

A ``atomicrmw`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.

View File

@ -6850,12 +6850,20 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
return Error(ValLoc, "atomicrmw value and pointer type do not match");
-if (!Val->getType()->isIntegerTy()) {
+if (Operation != AtomicRMWInst::Xchg && !Val->getType()->isIntegerTy()) {
return Error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be an integer");
}
if (Operation == AtomicRMWInst::Xchg &&
!Val->getType()->isIntegerTy() &&
!Val->getType()->isFloatingPointTy()) {
return Error(ValLoc, "atomicrmw " +
AtomicRMWInst::getOperationName(Operation) +
" operand must be an integer or floating point type");
}
unsigned Size = Val->getType()->getPrimitiveSizeInBits();
if (Size < 8 || (Size & (Size - 1))) if (Size < 8 || (Size & (Size - 1)))
return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized" return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"

View File

@ -496,11 +496,26 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
Value *Loaded, Value *NewVal,
AtomicOrdering MemOpOrder,
Value *&Success, Value *&NewLoaded) {
Type *OrigTy = NewVal->getType();
// This code can go away when cmpxchg supports FP types.
bool NeedBitcast = OrigTy->isFloatingPointTy();
if (NeedBitcast) {
IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
unsigned AS = Addr->getType()->getPointerAddressSpace();
Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
NewVal = Builder.CreateBitCast(NewVal, IntTy);
Loaded = Builder.CreateBitCast(Loaded, IntTy);
}
Value* Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
if (NeedBitcast)
NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
}
/// Emit IR to implement the given atomicrmw operation on values in registers,

View File

@ -4532,6 +4532,24 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
Results.push_back(CvtVec);
break;
}
case ISD::ATOMIC_SWAP: {
AtomicSDNode *AM = cast<AtomicSDNode>(Node);
SDLoc SL(Node);
SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
"unexpected promotion type");
assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
"unexpected atomic_swap with illegal type");
SDValue NewAtomic
= DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
DAG.getVTList(NVT, MVT::Other),
{ AM->getChain(), AM->getBasePtr(), CastVal },
AM->getMemOperand());
Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
Results.push_back(NewAtomic.getValue(1));
break;
}
}
// Replace the original node with the legalized result.

View File

@ -104,6 +104,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break;
case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break;
case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break;
case ISD::SINT_TO_FP:
@ -1932,7 +1933,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break;
case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
}
if (R.getNode())
@ -2166,3 +2167,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
N->getValueType(0)));
}
SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
EVT VT = N->getValueType(0);
EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
AtomicSDNode *AM = cast<AtomicSDNode>(N);
SDLoc SL(N);
SDValue CastVal = BitConvertToInteger(AM->getVal());
EVT CastVT = CastVal.getValueType();
SDValue NewAtomic
= DAG.getAtomic(ISD::ATOMIC_SWAP, SL, CastVT,
DAG.getVTList(CastVT, MVT::Other),
{ AM->getChain(), AM->getBasePtr(), CastVal },
AM->getMemOperand());
SDValue ResultCast = DAG.getNode(GetPromotionOpcode(VT, NFPVT), SL, NFPVT,
NewAtomic);
// Legalize the chain result by replacing uses of the old value chain with the
// new one
ReplaceValueWith(SDValue(N, 1), NewAtomic.getValue(1));
return ResultCast;
}

View File

@ -640,6 +640,7 @@ private:
SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
SDValue PromoteFloatRes_UnaryOp(SDNode *N);
SDValue PromoteFloatRes_UNDEF(SDNode *N);
SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
bool PromoteFloatOperand(SDNode *N, unsigned OpNo);

View File

@ -583,6 +583,14 @@ void TargetLoweringBase::initActions() {
std::fill(std::begin(TargetDAGCombineArray),
std::end(TargetDAGCombineArray), 0);
for (MVT VT : MVT::fp_valuetypes()) {
MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
if (IntVT.isValid()) {
setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
}
}
// Set default actions for various operations.
for (MVT VT : MVT::all_valuetypes()) {
// Default all indexed load / store to expand.

View File

@ -3431,10 +3431,17 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
Assert(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
Type *ElTy = PTy->getElementType();
-Assert(ElTy->isIntegerTy(), "atomicrmw " +
-       AtomicRMWInst::getOperationName(Op) +
-       " operand must have integer type!",
-       &RMWI, ElTy);
+if (Op == AtomicRMWInst::Xchg) {
+  Assert(ElTy->isIntegerTy() || ElTy->isFloatingPointTy(), "atomicrmw " +
+         AtomicRMWInst::getOperationName(Op) +
+         " operand must have integer or floating point type!",
+         &RMWI, ElTy);
+} else {
+  Assert(ElTy->isIntegerTy(), "atomicrmw " +
+         AtomicRMWInst::getOperationName(Op) +
+         " operand must have integer type!",
+         &RMWI, ElTy);
+}
checkAtomicMemAccessSize(ElTy, &RMWI);
Assert(ElTy == RMWI.getOperand(1)->getType(),
"Argument value type does not match pointer operand type!", &RMWI,

View File

@ -11655,9 +11655,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
-return Builder.CreateTruncOrBitCast(
-    Builder.CreateCall(Ldxr, Addr),
-    cast<PointerType>(Addr->getType())->getElementType());
+Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
+const DataLayout &DL = M->getDataLayout();
+IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+return Builder.CreateBitCast(Trunc, EltTy);
}
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
@ -11692,6 +11696,10 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
return Builder.CreateCall(Stxr,
{Builder.CreateZExtOrBitCast(
Val, Stxr->getFunctionType()->getParamType(0)),

View File

@ -0,0 +1,7 @@
; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
; CHECK: error: atomicrmw xchg operand must be an integer or floating point type
define void @f(i32** %ptr) {
atomicrmw xchg i32** %ptr, i32* null seq_cst
ret void
}

View File

@ -761,6 +761,12 @@ define void @atomics(i32* %word) {
ret void
}
define void @fp_atomics(float* %word) {
; CHECK: %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.000000e+00 monotonic
%atomicrmw.xchg = atomicrmw xchg float* %word, float 1.0 monotonic
ret void
}
;; Fast Math Flags
define void @fastmathflags_unop(float %op1) {
%f.nnan = fneg nnan float %op1

View File

@ -703,6 +703,16 @@ entry:
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
entry:
%gep = getelementptr float, float* %out, i32 4
%val = atomicrmw volatile xchg float* %gep, float %in seq_cst
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}

View File

@ -650,6 +650,15 @@ entry:
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
entry:
%gep = getelementptr double, double* %out, i64 4
%tmp0 = atomicrmw volatile xchg double* %gep, double %in seq_cst
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]

View File

@ -839,6 +839,17 @@ entry:
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
entry:
%gep = getelementptr float, float addrspace(1)* %out, i64 4
%val = atomicrmw volatile xchg float addrspace(1)* %gep, float %in seq_cst
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]

View File

@ -783,6 +783,17 @@ entry:
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
entry:
%gep = getelementptr double, double addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile xchg double addrspace(1)* %gep, double %in seq_cst
ret void
}
; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]

View File

@ -36,6 +36,20 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_f32_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; EG: LDS_WRXCHG_RET *
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %out, float addrspace(3)* %ptr) nounwind {
%gep = getelementptr float, float addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg float addrspace(3)* %gep, float 4.0 seq_cst
store float %result, float addrspace(1)* %out, align 4
ret void
}
; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *

View File

@ -27,6 +27,19 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_f64_offset:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %out, double addrspace(3)* %ptr) nounwind {
%gep = getelementptr double, double addrspace(3)* %ptr, i32 4
%result = atomicrmw xchg double addrspace(3)* %gep, double 4.0 seq_cst
store double %result, double addrspace(1)* %out, align 8
ret void
}
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
; SICIVI: s_mov_b32 m0
; GFX9-NOT: m0

View File

@ -360,3 +360,27 @@ define void @atomic_store_relaxed(i128* %p, i128 %in) {
store atomic i128 %in, i128* %p unordered, align 16
ret void
}
@fsc128 = external global fp128
define void @atomic_fetch_swapf128(fp128 %x) nounwind {
; CHECK-LABEL: atomic_fetch_swapf128:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rsi, %rcx
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movq _fsc128@{{.*}}(%rip), %rsi
; CHECK-NEXT: movq (%rsi), %rax
; CHECK-NEXT: movq 8(%rsi), %rdx
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB14_1: ## %atomicrmw.start
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lock cmpxchg16b (%rsi)
; CHECK-NEXT: jne LBB14_1
; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
%t1 = atomicrmw xchg fp128* @fsc128, fp128 %x acquire
ret void
}

View File

@ -2,6 +2,7 @@
; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
@sc16 = external global i16
@fsc16 = external global half
define void @atomic_fetch_add16() nounwind {
; X64-LABEL: atomic_fetch_add16
@ -273,3 +274,14 @@ define void @atomic_fetch_swap16(i16 %x) nounwind {
; X64: ret
; X32: ret
}
define void @atomic_fetch_swapf16(half %x) nounwind {
%t1 = atomicrmw xchg half* @fsc16, half %x acquire
; X64-NOT: lock
; X64: xchgw
; X32-NOT: lock
; X32: xchgw
ret void
; X64: ret
; X32: ret
}

View File

@ -4,6 +4,7 @@
; RUN: llc < %s -O0 -mtriple=i686-unknown-unknown -mcpu=corei7 -mattr=-cmov,-sse -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOCMOV
@sc32 = external global i32
@fsc32 = external global float
define void @atomic_fetch_add32() nounwind {
; X64-LABEL: atomic_fetch_add32:
@ -708,3 +709,35 @@ define void @atomic_fetch_swap32(i32 %x) nounwind {
%t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
ret void
}
define void @atomic_fetch_swapf32(float %x) nounwind {
; X64-LABEL: atomic_fetch_swapf32:
; X64: # %bb.0:
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: xchgl %eax, {{.*}}(%rip)
; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: retq
;
; X86-CMOV-LABEL: atomic_fetch_swapf32:
; X86-CMOV: # %bb.0:
; X86-CMOV-NEXT: pushl %eax
; X86-CMOV-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-CMOV-NEXT: movd %xmm0, %eax
; X86-CMOV-NEXT: xchgl %eax, fsc32
; X86-CMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-CMOV-NEXT: popl %eax
; X86-CMOV-NEXT: retl
;
; X86-NOCMOV-LABEL: atomic_fetch_swapf32:
; X86-NOCMOV: # %bb.0:
; X86-NOCMOV-NEXT: subl $8, %esp
; X86-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; X86-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: xchgl %eax, fsc32
; X86-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NOCMOV-NEXT: addl $8, %esp
; X86-NOCMOV-NEXT: retl
%t1 = atomicrmw xchg float* @fsc32, float %x acquire
ret void
}

View File

@ -1,6 +1,7 @@
; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
@sc64 = external global i64
@fsc64 = external global double
define void @atomic_fetch_add64() nounwind {
; X64-LABEL: atomic_fetch_add64:
@ -233,3 +234,16 @@ define void @atomic_fetch_swap64(i64 %x) nounwind {
; X64: ret
; X32: ret
}
define void @atomic_fetch_swapf64(double %x) nounwind {
; X64-LABEL: atomic_fetch_swapf64:
; X32-LABEL: atomic_fetch_swapf64:
%t1 = atomicrmw xchg double* @fsc64, double %x acquire
; X64-NOT: lock
; X64: xchgq
; X32: lock
; X32: xchg8b
ret void
; X64: ret
; X32: ret
}

View File

@ -0,0 +1,57 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s
define void @atomic_swap_f16(half* %ptr, half %val) nounwind {
; CHECK-LABEL: @atomic_swap_f16(
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f16(half* [[PTR:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i16
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
; CHECK-NEXT: [[TMP4:%.*]] = bitcast half [[VAL:%.*]] to i16
; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f16(i64 [[TMP5]], half* [[PTR]])
; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
%t1 = atomicrmw xchg half* %ptr, half %val acquire
ret void
}
define void @atomic_swap_f32(float* %ptr, float %val) nounwind {
; CHECK-LABEL: @atomic_swap_f32(
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f32(float* [[PTR:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[VAL:%.*]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f32(i64 [[TMP5]], float* [[PTR]])
; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
%t1 = atomicrmw xchg float* %ptr, float %val acquire
ret void
}
define void @atomic_swap_f64(double* %ptr, double %val) nounwind {
; CHECK-LABEL: @atomic_swap_f64(
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f64(double* [[PTR:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
; CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[VAL:%.*]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.aarch64.stxr.p0f64(i64 [[TMP3]], double* [[PTR]])
; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP4]], 0
; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
%t1 = atomicrmw xchg double* %ptr, double %val acquire
ret void
}

View File

@ -0,0 +1,3 @@
if not 'AArch64' in config.root.targets:
config.unsupported = True

View File

@ -0,0 +1,42 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=i686-linux-gnu -atomic-expand %s | FileCheck %s
define double @atomic_xchg_f64(double* %ptr) nounwind {
; CHECK-LABEL: @atomic_xchg_f64(
; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64*
; CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
%result = atomicrmw xchg double* %ptr, double 4.0 seq_cst
ret double %result
}
define double @atomic_xchg_f64_as1(double addrspace(1)* %ptr) nounwind {
; CHECK-LABEL: @atomic_xchg_f64_as1(
; CHECK-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)*
; CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
%result = atomicrmw xchg double addrspace(1)* %ptr, double 4.0 seq_cst
ret double %result
}