[NVPTX] Implement __nvvm_atom_add_gen_d builtin.
Summary:
This just seems to have been an oversight. We already supported the f64
atomic add with an explicit scope (e.g. "cta"), but not the scopeless
version.

Reviewers: tra

Subscribers: jholewinski, sanjoy, cfe-commits, llvm-commits, hiraditya

Differential Revision: https://reviews.llvm.org/D39638

llvm-svn: 317623
commit da9e0bd3a2 (parent 3ae8dfda06)
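For context, here is a minimal CUDA sketch of what the patch enables. It is illustrative only (the function and variable names are not part of the patch) and assumes a device compile targeting sm_60, i.e. with the "satom" target feature:

#define __device__ __attribute__((device))

__device__ double sum;

// All three f64 atomic adds now compile: the scopeless form handled by
// this patch lowers to atom.add.f64; the scoped forms already worked.
__device__ void sketch(double d) {
  __nvvm_atom_add_gen_d(&sum, d);      // new: scopeless
  __nvvm_atom_cta_add_gen_d(&sum, d);  // existing: cta scope
  __nvvm_atom_sys_add_gen_d(&sum, d);  // existing: sys scope
}

Per the new clang test below, compiling the scopeless call for sm_50 instead produces the "needs target feature satom" diagnostic.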
@@ -481,7 +481,7 @@ TARGET_BUILTIN(__nvvm_atom_cta_add_gen_f, "ffD*f", "n", "satom")
 TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", "satom")
 BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n")
 BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n")
-BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n")
+TARGET_BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n", "satom")
 TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", "satom")
 TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", "satom")
@@ -9554,6 +9554,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(FnALAF32, {Ptr, Val});
   }
 
+  case NVPTX::BI__nvvm_atom_add_gen_d: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    // atomicrmw only deals with integer arguments, so we need to use
+    // LLVM's nvvm_atomic_load_add_f64 intrinsic.
+    Value *FnALAF64 =
+        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType());
+    return Builder.CreateCall(FnALAF64, {Ptr, Val});
+  }
+
   case NVPTX::BI__nvvm_atom_inc_gen_ui: {
     Value *Ptr = EmitScalarExpr(E->getArg(0));
     Value *Val = EmitScalarExpr(E->getArg(1));
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN:   -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK %s
+//
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 \
+// RUN:   -fcuda-is-device -S -o /dev/null -x cuda -verify %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+// We have to keep all builtins that depend on a particular target feature
+// in the same function, because codegen will stop after the very first
+// function that encounters an error, so -verify will not be able to find
+// errors in subsequent functions.
+
+// CHECK-LABEL: test_fn
+__device__ void test_fn(double d, double* double_ptr) {
+  // CHECK: call double @llvm.nvvm.atomic.load.add.f64.p0f64
+  // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature satom}}
+  __nvvm_atom_add_gen_d(double_ptr, d);
+}
@@ -683,10 +683,15 @@ let TargetPrefix = "nvvm" in {
   Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
 
 
-// Atomic not available as an llvm intrinsic.
+// Atomics not available as llvm intrinsics.
 def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
         [LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
                                       [IntrArgMemOnly, NoCapture<0>]>;
+// Atomic add of f64 requires sm_60.
+def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty],
+        [LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty],
+                                      [IntrArgMemOnly, NoCapture<0>]>;
+
 def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
         [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
                                       [IntrArgMemOnly, NoCapture<0>]>;
@@ -3449,6 +3449,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   }
 
   case Intrinsic::nvvm_atomic_load_add_f32:
+  case Intrinsic::nvvm_atomic_load_add_f64:
   case Intrinsic::nvvm_atomic_load_inc_32:
   case Intrinsic::nvvm_atomic_load_dec_32:
 
@@ -1095,6 +1095,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
 def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
 
 defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
   atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1121,6 +1127,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
 defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
   atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
 
+defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
+  atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
+  atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
+  atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
+
 // atom_sub
 
 def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK-LABEL: .func test(
+define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
+; CHECK: atom.add.f64
+  %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
+; CHECK: atom.global.add.f64
+  %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
+; CHECK: atom.shared.add.f64
+  %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d)
+  ret void
+}
+
+declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1
+declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1
+
+attributes #1 = { argmemonly nounwind }