From da9e0bd3a238f80b05a3070c8a8a3cadadc12e5f Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 7 Nov 2017 22:10:54 +0000 Subject: [PATCH] [NVPTX] Implement __nvvm_atom_add_gen_d builtin. Summary: This just seems to have been an oversight. We already supported the f64 atomic add with an explicit scope (e.g. "cta"), but not the scopeless version. Reviewers: tra Subscribers: jholewinski, sanjoy, cfe-commits, llvm-commits, hiraditya Differential Revision: https://reviews.llvm.org/D39638 llvm-svn: 317623 --- clang/include/clang/Basic/BuiltinsNVPTX.def | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 10 +++++++++ clang/test/CodeGen/builtins-nvptx-ptx50.cu | 23 +++++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 7 ++++++- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 13 ++++++++++++ llvm/test/CodeGen/NVPTX/atomics-sm60.ll | 19 +++++++++++++++++ 7 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/builtins-nvptx-ptx50.cu create mode 100644 llvm/test/CodeGen/NVPTX/atomics-sm60.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index caa860480f7e..b596793c9c12 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -481,7 +481,7 @@ TARGET_BUILTIN(__nvvm_atom_cta_add_gen_f, "ffD*f", "n", "satom") TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", "satom") BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n") BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n") -BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n") +TARGET_BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n", "satom") TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", "satom") TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", "satom") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f1b8e2e61109..369240f316c2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9554,6 +9554,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(FnALAF32, {Ptr, Val}); } + case NVPTX::BI__nvvm_atom_add_gen_d: { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Value *Val = EmitScalarExpr(E->getArg(1)); + // atomicrmw only deals with integer arguments, so we need to use + // LLVM's nvvm_atomic_load_add_f64 intrinsic. + Value *FnALAF64 = + CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_add_f64, Ptr->getType()); + return Builder.CreateCall(FnALAF64, {Ptr, Val}); + } + case NVPTX::BI__nvvm_atom_inc_gen_ui: { Value *Ptr = EmitScalarExpr(E->getArg(0)); Value *Val = EmitScalarExpr(E->getArg(1)); diff --git a/clang/test/CodeGen/builtins-nvptx-ptx50.cu b/clang/test/CodeGen/builtins-nvptx-ptx50.cu new file mode 100644 index 000000000000..e85be442eb47 --- /dev/null +++ b/clang/test/CodeGen/builtins-nvptx-ptx50.cu @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \ +// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK %s +// +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 \ +// RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __shared__ __attribute__((shared)) +#define __constant__ __attribute__((constant)) + +// We have to keep all builtins that depend on particular target feature in the +// same function, because the codegen will stop after the very first function +// that encounters an error, so -verify will not be able to find errors in +// subsequent functions. + +// CHECK-LABEL: test_fn +__device__ void test_fn(double d, double* double_ptr) { + // CHECK: call double @llvm.nvvm.atomic.load.add.f64.p0f64 + // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature satom}} + __nvvm_atom_add_gen_d(double_ptr, d); +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7ba1a3eb2e5b..249419d15d3f 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -683,10 +683,15 @@ let TargetPrefix = "nvvm" in { Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>; -// Atomic not available as an llvm intrinsic. +// Atomics not available as llvm intrinsics. def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty], [LLVMAnyPointerType, llvm_float_ty], [IntrArgMemOnly, NoCapture<0>]>; + // Atomic add of f64 requires sm_60. + def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty], + [LLVMAnyPointerType, llvm_double_ty], + [IntrArgMemOnly, NoCapture<0>]>; + def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty], [LLVMAnyPointerType, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7b9acb20b759..ac4f2544fc34 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3449,6 +3449,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( } case Intrinsic::nvvm_atomic_load_add_f32: + case Intrinsic::nvvm_atomic_load_add_f64: case Intrinsic::nvvm_atomic_load_inc_32: case Intrinsic::nvvm_atomic_load_dec_32: diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f745b6f66353..478f3e9d0577 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1095,6 +1095,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>; +def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; +def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; +def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), + (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>; defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2; @@ -1121,6 +1127,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2; +defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2; +defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2; +defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2; + // atom_sub def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll new file mode 100644 index 000000000000..0b5bafb780c5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s + +; CHECK-LABEL .func test( +define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) { +; CHECK: atom.add.f64 + %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d) +; CHECK: atom.global.add.f64 + %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d) +; CHECK: atom.shared.add.f64 + %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d) + ret void +} + +declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1 +declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1 +declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1 + +attributes #1 = { argmemonly nounwind }