[CUDA][HIP] Fix device variable linkage

For -fgpu-rdc, shadow variables should not be internalized, otherwise
they cannot be accessed by other TUs. This is necessary because
the shadow variable of external device variables are always
emitted as undefined symbols, which need to resolve to a global
symbols.

Managed variables need to be emitted as undefined symbols
in device compilations.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D95901
This commit is contained in:
Yaxun (Sam) Liu 2021-02-02 18:06:33 -05:00
parent c981f6f8e1
commit b008ea304d
6 changed files with 166 additions and 44 deletions

View File

@ -11435,16 +11435,22 @@ operator<<(const StreamingDiagnostic &DB,
}
bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
return !getLangOpts().GPURelocatableDeviceCode &&
((D->hasAttr<CUDADeviceAttr>() &&
bool IsStaticVar =
isa<VarDecl>(D) && cast<VarDecl>(D)->getStorageClass() == SC_Static;
bool IsExplicitDeviceVar = (D->hasAttr<CUDADeviceAttr>() &&
!D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
(D->hasAttr<CUDAConstantAttr>() &&
!D->getAttr<CUDAConstantAttr>()->isImplicit())) &&
isa<VarDecl>(D) && cast<VarDecl>(D)->isFileVarDecl() &&
cast<VarDecl>(D)->getStorageClass() == SC_Static;
!D->getAttr<CUDAConstantAttr>()->isImplicit());
// CUDA/HIP: static managed variables need to be externalized since it is
// a declaration in IR, therefore cannot have internal linkage.
// ToDo: externalize static variables for -fgpu-rdc.
return IsStaticVar &&
(D->hasAttr<HIPManagedAttr>() ||
(!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar));
}
bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
return mayExternalizeStaticVar(D) &&
CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D));
(D->hasAttr<HIPManagedAttr>() ||
CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
}

View File

@ -546,6 +546,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
/*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
llvm::GlobalVariable::NotThreadLocal);
ManagedVar->setDSOLocal(Var->isDSOLocal());
ManagedVar->setVisibility(Var->getVisibility());
replaceManagedVar(Var, ManagedVar);
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
@ -932,11 +934,16 @@ CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
void CGNVCUDARuntime::internalizeDeviceSideVar(
const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
// Host-side shadows of external declarations of device-side
// global variables become internal definitions. These have to
// be internal in order to prevent name conflicts with global
// host variables with the same name in a different TUs.
// For -fno-gpu-rdc, host-side shadows of external declarations of device-side
// global variables become internal definitions. These have to be internal in
// order to prevent name conflicts with global host variables with the same
// name in a different TUs.
//
// For -fgpu-rdc, the shadow variables should not be internalized because
// they may be accessed by different TU.
if (CGM.getLangOpts().GPURelocatableDeviceCode)
return;
// __shared__ variables are odd. Shadows do get created, but
// they are not registered with the CUDA runtime, so they
// can't really be used to access their device-side

View File

@ -4169,8 +4169,12 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
bool NeedsGlobalDtor =
D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
bool IsHIPManagedVarOnDevice =
getLangOpts().CUDAIsDevice && D->hasAttr<HIPManagedAttr>();
const VarDecl *InitDecl;
const Expr *InitExpr = D->getAnyInitializer(InitDecl);
const Expr *InitExpr =
IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl);
Optional<ConstantEmitter> emitter;
@ -4190,8 +4194,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
(D->getType()->isCUDADeviceBuiltinSurfaceType() ||
D->getType()->isCUDADeviceBuiltinTextureType() ||
D->hasAttr<HIPManagedAttr>());
// HIP pinned shadow of initialized host-side global variables are also
// left undefined.
if (getLangOpts().CUDA &&
(IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar))
Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
@ -4302,6 +4304,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
}
}
// HIP managed variables need to be emitted as declarations in device
// compilation.
if (!IsHIPManagedVarOnDevice)
GV->setInitializer(Init);
if (emitter)
emitter->finalize(GV);

View File

@ -30,9 +30,13 @@
// RUN: | FileCheck %s -allow-deprecated-dag-overlap \
// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
// RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
// RUN: | FileCheck %s -allow-deprecated-dag-overlap \
// RUN: --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW,LNX_17,NORDC17
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
// RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
// RUN: | FileCheck %s -allow-deprecated-dag-overlap \
// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17
// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17,RDC17
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -target-sdk-version=9.2 -o - \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@ -45,7 +49,7 @@
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF
@ -56,15 +60,18 @@
#include "Inputs/cuda.h"
#ifndef NOGLOBALS
// LNX-DAG: @device_var = internal global i32
// NORDC-DAG: @device_var = internal global i32
// RDC-DAG: @device_var = dso_local global i32
// WIN-DAG: @"?device_var@@3HA" = internal global i32
__device__ int device_var;
// LNX-DAG: @constant_var = internal global i32
// NORDC-DAG: @constant_var = internal global i32
// RDC-DAG: @constant_var = dso_local global i32
// WIN-DAG: @"?constant_var@@3HA" = internal global i32
__constant__ int constant_var;
// LNX-DAG: @shared_var = internal global i32
// NORDC-DAG: @shared_var = internal global i32
// RDC-DAG: @shared_var = dso_local global i32
// WIN-DAG: @"?shared_var@@3HA" = internal global i32
__shared__ int shared_var;
@ -87,18 +94,21 @@ extern __constant__ int ext_constant_var;
// external device-side variables with definitions should generate
// definitions for the shadows.
// LNX-DAG: @ext_device_var_def = internal global i32 undef,
// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
// WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef
extern __device__ int ext_device_var_def;
__device__ int ext_device_var_def = 1;
// LNX-DAG: @ext_device_var_def = internal global i32 undef,
// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
// WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
__constant__ int ext_constant_var_def = 2;
#if __cplusplus > 201402L
/// FIXME: Reject __device__ constexpr and inline variables in Sema.
// LNX_17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
// LNX_17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
// NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
// RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}}
// NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
// RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}}
__device__ inline int inline_var = 3;
struct C {
__device__ static constexpr int member_inline_var = 4;
@ -151,13 +161,13 @@ void use_pointers() {
// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
// * constant unnamed string with NVModuleID
// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
// * Make sure our constructor was added to global ctor list.
// LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
// * Alias to global symbol containing the NVModuleID.
// RDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
// CUDARDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
// CUDARDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
// Test that we build the correct number of calls to cudaSetupArgument followed
// by a call to cudaLaunch.
@ -214,25 +224,33 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
// HIP-NEXT: icmp eq i8** {{.*}}, null
// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
// HIP: if:
// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// CUDANORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// .. stores return value in __[[PREFIX]]_gpubin_handle
// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
// CUDANORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
// .. and then calls __[[PREFIX]]_register_globals
// HIP: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// .. stores return value in __[[PREFIX]]_gpubin_handle
// HIP-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
// .. and then calls __[[PREFIX]]_register_globals
// HIP-NEXT: br label %exit
// HIP: exit:
// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
// CUDANORDC-NEXT: call void @__[[PREFIX]]_register_globals
// HIP-NEXT: call void @__[[PREFIX]]_register_globals
// * In separate mode we also register a destructor.
// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
// CUDANORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
// HIP-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
// With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
// RDC-SAME: [[MODULE_ID_GLOBAL]]
// CUDARDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
// CUDARDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
// CUDARDC-SAME: [[MODULE_ID_GLOBAL]]
// Test that we've created destructor.
// NORDC: define internal void @__[[PREFIX]]_module_dtor
// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
// CUDANORDC: define internal void @__[[PREFIX]]_module_dtor
// HIP: define internal void @__[[PREFIX]]_module_dtor
// CUDANORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
// HIP: load{{.*}}__[[PREFIX]]_gpubin_handle
// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
// HIP-NEXT: icmp ne i8** {{.*}}, null
// HIP-NEXT: br i1 {{.*}}, label %if, label %exit

View File

@ -0,0 +1,65 @@
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
#include "Inputs/cuda.h"
// DEV-DAG: @v1 = dso_local addrspace(1) externally_initialized global i32 0
// NORDC-H-DAG: @v1 = internal global i32 undef
// RDC-H-DAG: @v1 = dso_local global i32 undef
__device__ int v1;
// DEV-DAG: @v2 = dso_local addrspace(4) externally_initialized global i32 0
// NORDC-H-DAG: @v2 = internal global i32 undef
// RDC-H-DAG: @v2 = dso_local global i32 undef
__constant__ int v2;
// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32
// NORDC-H-DAG: @v3 = internal global i32 0
// RDC-H-DAG: @v3 = dso_local global i32 0
__managed__ int v3;
// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
extern __device__ int ev1;
// DEV-DAG: @ev2 = external addrspace(4) global i32
// HOST-DAG: @ev2 = external global i32
extern __constant__ int ev2;
// DEV-DAG: @ev3 = external addrspace(1) global i32
// HOST-DAG: @ev3 = external global i32
extern __managed__ int ev3;
// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
static __constant__ int sv2;
// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32
// HOST-DAG: @_ZL3sv3 = internal global i32 0
static __managed__ int sv3;
__device__ __host__ int work(int *x);
__device__ __host__ int fun1() {
return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
}
// HOST: hipRegisterVar({{.*}}@v1
// HOST: hipRegisterVar({{.*}}@v2
// HOST: hipRegisterManagedVar({{.*}}@v3
// HOST-NOT: hipRegisterVar({{.*}}@ev1
// HOST-NOT: hipRegisterVar({{.*}}@ev2
// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
// HOST: hipRegisterVar({{.*}}@_ZL3sv1
// HOST: hipRegisterVar({{.*}}@_ZL3sv2
// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3

View File

@ -10,17 +10,19 @@
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=HOST %s
// RUN: -check-prefixes=HOST,NORDC %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=HOST %s
// RUN: -check-prefixes=HOST,RDC %s
#include "Inputs/cuda.h"
// DEV-DAG: @x = {{.*}}addrspace(1) externally_initialized global i32 undef
// HOST-DAG: @x = internal global i32 1
// HOST-DAG: @x.managed = internal global i32* null
// DEV-DAG: @x = external addrspace(1) externally_initialized global i32
// NORDC-DAG: @x = internal global i32 1
// RDC-DAG: @x = dso_local global i32 1
// NORDC-DAG: @x.managed = internal global i32* null
// RDC-DAG: @x.managed = dso_local global i32* null
// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
struct vec {
@ -31,11 +33,28 @@ __managed__ int x = 1;
__managed__ vec v[100];
__managed__ vec v2[100] = {{1, 1, 1}};
// DEV-DAG: @ex = external addrspace(1) global i32
// HOST-DAG: @ex = external global i32
extern __managed__ int ex;
// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32
// HOST-DAG: @_ZL2sx = internal global i32 1
// HOST-DAG: @_ZL2sx.managed = internal global i32* null
static __managed__ int sx = 1;
// HOST-NOT: @ex.managed
// Force ex and sx mitted in device compilation.
__global__ void foo(int *z) {
*z = x;
*z = x + ex + sx;
v[1].x = 2;
}
// Force ex and sx emitted in host compilatioin.
int foo2() {
return ex + sx;
}
// HOST-LABEL: define {{.*}}@_Z4loadv()
// HOST: %ld.managed = load i32*, i32** @x.managed, align 4
// HOST: %0 = load i32, i32* %ld.managed, align 4
@ -97,4 +116,6 @@ float addr_taken2() {
}
// HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx
// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex
// HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)