[NVPTX] Implement __nvvm_atom_add_gen_d builtin.

Summary:
This just seems to have been an oversight.  We already supported the f64
atomic add with an explicit scope (e.g. "cta"), but not the scopeless
version.
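
As a usage-level illustration (not part of the patch; the kernel and parameter names are made up), the scopeless builtin now sits alongside the already-supported scoped variants when compiling device code for an architecture that has the satom feature (e.g. sm_60):

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

// Each builtin atomically adds 'x' to '*sum' and returns the previous value.
__global__ void accumulate(double *sum, double x) {
  double old;
  old = __nvvm_atom_add_gen_d(sum, x);      // scopeless form added by this patch
  old = __nvvm_atom_cta_add_gen_d(sum, x);  // CTA-scoped, already supported
  old = __nvvm_atom_sys_add_gen_d(sum, x);  // system-scoped, already supported
  (void)old;
}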

Reviewers: tra

Subscribers: jholewinski, sanjoy, cfe-commits, llvm-commits, hiraditya

Differential Revision: https://reviews.llvm.org/D39638

llvm-svn: 317623
Justin Lebar 2017-11-07 22:10:54 +00:00
parent 3ae8dfda06
commit da9e0bd3a2
7 changed files with 73 additions and 2 deletions

View File

@@ -481,7 +481,7 @@ TARGET_BUILTIN(__nvvm_atom_cta_add_gen_f, "ffD*f", "n", "satom")
TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", "satom")
BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n")
BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n")
BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n")
TARGET_BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n", "satom")
TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", "satom")
TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", "satom")
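
Because the definition moves from BUILTIN to TARGET_BUILTIN with the "satom" feature, the generic f64 add is now rejected on targets that lack that feature (sm_60 and later provide it). A hedged sketch of how a caller might guard the call with the usual __CUDA_ARCH__ check; the function name and the fallback are illustrative only:

#define __device__ __attribute__((device))

__device__ double add_f64(double *p, double v) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
  return __nvvm_atom_add_gen_d(p, v);  // lowers to atom.add.f64 on sm_60+
#else
  // Placeholder for older GPUs; without satom, calling the builtin here would
  // produce "needs target feature satom", as the new CodeGenCUDA test checks.
  return 0.0;
#endif
}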

View File

@@ -9554,6 +9554,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
    return Builder.CreateCall(FnALAF32, {Ptr, Val});
  }
  case NVPTX::BI__nvvm_atom_add_gen_d: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));
    // atomicrmw only deals with integer arguments, so we need to use
    // LLVM's nvvm_atomic_load_add_f64 intrinsic.
    Value *FnALAF64 =
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType());
    return Builder.CreateCall(FnALAF64, {Ptr, Val});
  }
  case NVPTX::BI__nvvm_atom_inc_gen_ui: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));

View File

@@ -0,0 +1,23 @@
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK %s
//
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 \
// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
// We have to keep all builtins that depend on a particular target feature in the
// same function, because the codegen will stop after the very first function
// that encounters an error, so -verify will not be able to find errors in
// subsequent functions.
// CHECK-LABEL: test_fn
__device__ void test_fn(double d, double* double_ptr) {
// CHECK: call double @llvm.nvvm.atomic.load.add.f64.p0f64
// expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature satom}}
__nvvm_atom_add_gen_d(double_ptr, d);
}

View File

@@ -683,10 +683,15 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
// Atomic not available as an llvm intrinsic.
// Atomics not available as llvm intrinsics.
def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
[LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
[IntrArgMemOnly, NoCapture<0>]>;
// Atomic add of f64 requires sm_60.
def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty],
[LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty],
[IntrArgMemOnly, NoCapture<0>]>;
def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
[IntrArgMemOnly, NoCapture<0>]>;

View File

@@ -3449,6 +3449,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
}
case Intrinsic::nvvm_atomic_load_add_f32:
case Intrinsic::nvvm_atomic_load_add_f64:
case Intrinsic::nvvm_atomic_load_inc_32:
case Intrinsic::nvvm_atomic_load_dec_32:

View File

@@ -1095,6 +1095,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1121,6 +1127,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
// atom_sub
def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
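
The three patterns added above cover the global, shared, and generic address spaces for the new f64 intrinsic. Roughly, at the CUDA level (names are hypothetical, and the exact opcode depends on whether the backend can infer the pointer's address space, as the IR test below exercises directly):

#define __device__ __attribute__((device))
#define __shared__ __attribute__((shared))

__device__ double reduce_into(double *global_acc, double x) {
  __shared__ double block_acc;                  // lives in shared memory
  __nvvm_atom_add_gen_d(&block_acc, x);         // may select atom.shared.add.f64
  return __nvvm_atom_add_gen_d(global_acc, x);  // atom.add.f64 or atom.global.add.f64
}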

View File

@@ -0,0 +1,19 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
; CHECK-LABEL: .func test(
define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) {
; CHECK: atom.add.f64
%r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d)
; CHECK: atom.global.add.f64
%r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d)
; CHECK: atom.shared.add.f64
%ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d)
ret void
}
declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1
declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1
declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1
attributes #1 = { argmemonly nounwind }