[CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

For -fgpu-rdc mode, static device vars in different TU's may have the same name.
To support accessing file-scope static device variables in host code, we need to give them
a distinct name and external linkage. This can be done by postfixing each static device variable with
a distinct CUID (Compilation Unit ID) hash.

Since the static device variables have different name across compilation units, now we let
them have external linkage so that they can be looked up by the runtime.

Reviewed by: Artem Belevich, and Jon Chesterfield

Differential Revision: https://reviews.llvm.org/D85223
This commit is contained in:
Yaxun (Sam) Liu 2021-01-19 17:36:58 -05:00
parent c519460745
commit 47acdec1dd
9 changed files with 229 additions and 20 deletions

View File

@ -299,6 +299,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// This is lazily created. This is intentionally not serialized.
mutable llvm::StringMap<StringLiteral *> StringLiteralCache;
/// MD5 hash of CUID. It is calculated when first used and cached by this
/// data member.
mutable std::string CUIDHash;
/// Representation of a "canonical" template template parameter that
/// is used in canonical template names.
class CanonicalTemplateTemplateParm : public llvm::FoldingSetNode {
@ -3117,6 +3121,8 @@ public:
/// Whether a C++ static variable should be externalized.
bool shouldExternalizeStaticVar(const Decl *D) const;
StringRef getCUIDHash() const;
private:
/// All OMPTraitInfo objects live in this collection, one per
/// `pragma omp [begin] declare variant` directive.

View File

@ -84,6 +84,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@ -10645,7 +10646,10 @@ static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
return GVA_StrongODR;
// Single source offloading languages like CUDA/HIP need to be able to
// access static device variables from host code of the same compilation
// unit. This is done by externalizing the static variable.
// unit. This is done by externalizing the static variable with a shared
// name between the host and device compilation which is the same for the
// same compilation unit whereas different among different compilation
// units.
if (Context.shouldExternalizeStaticVar(D))
return GVA_StrongExternal;
}
@ -11533,10 +11537,8 @@ bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
!D->getAttr<CUDAConstantAttr>()->isImplicit());
// CUDA/HIP: static managed variables need to be externalized since it is
// a declaration in IR, therefore cannot have internal linkage.
// ToDo: externalize static variables for -fgpu-rdc.
return IsStaticVar &&
(D->hasAttr<HIPManagedAttr>() ||
(!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar));
(D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar);
}
bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
@ -11544,3 +11546,12 @@ bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
(D->hasAttr<HIPManagedAttr>() ||
CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
}
StringRef ASTContext::getCUIDHash() const {
if (!CUIDHash.empty())
return CUIDHash;
if (LangOpts.CUID.empty())
return StringRef();
CUIDHash = llvm::utohexstr(llvm::MD5Hash(LangOpts.CUID), /*LowerCase=*/true);
return CUIDHash;
}

View File

@ -255,6 +255,17 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
DeviceSideName = std::string(Out.str());
} else
DeviceSideName = std::string(ND->getIdentifier()->getName());
// Make unique name for device side static file-scope variable for HIP.
if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
CGM.getLangOpts().GPURelocatableDeviceCode &&
!CGM.getLangOpts().CUID.empty()) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << DeviceSideName;
CGM.printPostfixForExternalizedStaticVar(Out);
DeviceSideName = std::string(Out.str());
}
return DeviceSideName;
}

View File

@ -1184,6 +1184,11 @@ static std::string getMangledNameImpl(const CodeGenModule &CGM, GlobalDecl GD,
}
}
// Make unique name for device side static file-scope variable for HIP.
if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
CGM.getLangOpts().GPURelocatableDeviceCode &&
CGM.getLangOpts().CUDAIsDevice && !CGM.getLangOpts().CUID.empty())
CGM.printPostfixForExternalizedStaticVar(Out);
return std::string(Out.str());
}
@ -1241,9 +1246,16 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
}
}
auto FoundName = MangledDeclNames.find(CanonicalGD);
if (FoundName != MangledDeclNames.end())
return FoundName->second;
// In CUDA/HIP device compilation with -fgpu-rdc, the mangled name of a
// static device variable depends on whether the variable is referenced by
// a host or device host function. Therefore the mangled name cannot be
// cached.
if (!LangOpts.CUDAIsDevice ||
!getContext().mayExternalizeStaticVar(GD.getDecl())) {
auto FoundName = MangledDeclNames.find(CanonicalGD);
if (FoundName != MangledDeclNames.end())
return FoundName->second;
}
// Keep the first result in the case of a mangling collision.
const auto *ND = cast<NamedDecl>(GD.getDecl());
@ -6249,3 +6261,8 @@ bool CodeGenModule::stopAutoInit() {
}
return false;
}
void CodeGenModule::printPostfixForExternalizedStaticVar(
llvm::raw_ostream &OS) const {
OS << ".static." << getContext().getCUIDHash();
}

View File

@ -1422,6 +1422,10 @@ public:
TBAAAccessInfo *TBAAInfo = nullptr);
bool stopAutoInit();
/// Print the postfix for externalized static variable for single source
/// offloading languages CUDA and HIP.
void printPostfixForExternalizedStaticVar(llvm::raw_ostream &OS) const;
private:
llvm::Constant *GetOrCreateLLVMFunction(
StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,

View File

@ -2,13 +2,13 @@
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
#include "Inputs/cuda.h"
@ -37,14 +37,15 @@ extern __constant__ int ev2;
extern __managed__ int ev3;
// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
// RDC-DAG: @_ZL3sv2.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
static __constant__ int sv2;
// DEV-DAG: @_ZL3sv3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// NORDC-DAG: @_ZL3sv3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-DAG: @_ZL3sv3.static.[[HASH]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null
static __managed__ int sv3;

View File

@ -2,19 +2,24 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=COMMON,DEV %s
// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=COMMON,DEV %s
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev
// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=COMMON,HOST,NORDC %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=COMMON,HOST,RDC %s
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host
// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s
// Check device and host compilation use the same postfix for static
// variable name.
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
#include "Inputs/cuda.h"
@ -45,10 +50,17 @@ __managed__ vec v2[100] = {{1, 1, 1}};
// HOST-DAG: @ex = external externally_initialized global i32*
extern __managed__ int ex;
// DEV-DAG: @_ZL2sx.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
// DEV-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// NORDC-D-DAG: @_ZL2sx.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
// NORDC-D-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// HOST-DAG: @_ZL2sx.managed = internal global i32 1
// HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
// NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
static __managed__ int sx = 1;
// DEV-DAG: @llvm.compiler.used
@ -154,6 +166,6 @@ __device__ __host__ int load4() {
}
// HOST-DAG: __hipRegisterManagedVar({{.*}}@x {{.*}}@x.managed {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx {{.*}}@_ZL2sx.managed
// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx {{.*}}@_ZL2sx.managed {{.*}}@[[DEVNAMESX]]
// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex {{.*}}@ex.managed
// HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)

View File

@ -0,0 +1,97 @@
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=DEV,INT-DEV %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=HOST,INT-HOST %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s > %t.dev
// RUN: cat %t.dev | FileCheck -check-prefixes=DEV,EXT-DEV %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
// RUN: cat %t.host | FileCheck -check-prefixes=HOST,EXT-HOST %s
// Check host and device compilations use the same postfixes for static
// variable names.
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
#include "Inputs/cuda.h"
// Test function scope static device variable, which should not be externalized.
// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
// HOST-DAG: @_ZL1x = internal global i32 undef
// HOST-DAG: @_ZL1y = internal global i32 undef
// Test normal static device variables
// INT-DEV-DAG: @_ZL1x = dso_local addrspace(1) externally_initialized global i32 0
// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
// Test externalized static device variables
// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
// POSTFIX: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
static __device__ int x;
// Test static device variables not used by host code should not be externalized
// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
static __device__ int x2;
// Test normal static device variables
// INT-DEV-DAG: @_ZL1y = dso_local addrspace(4) externally_initialized global i32 0
// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
// Test externalized static device variables
// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
static __constant__ int y;
// Test static host variable, which should not be externalized nor registered.
// HOST-DAG: @_ZL1z = internal global i32 0
// DEV-NOT: @_ZL1z
static int z;
// Test static device variable in inline function, which should not be
// externalized nor registered.
// DEV-DAG: @_ZZ6devfunPPKiE1p = linkonce_odr addrspace(4) constant i32 2, comdat
inline __device__ void devfun(const int ** b) {
const static int p = 2;
b[0] = &p;
}
__global__ void kernel(int *a, const int **b) {
const static int w = 1;
a[0] = x;
a[1] = y;
b[0] = &w;
b[1] = &x2;
devfun(b);
}
int* getDeviceSymbol(int *x);
void foo() {
getDeviceSymbol(&x);
getDeviceSymbol(&y);
z = 123;
}
// HOST: __hipRegisterVar({{.*}}@_ZL1x {{.*}}@[[DEVNAMEX]]
// HOST: __hipRegisterVar({{.*}}@_ZL1y {{.*}}@[[DEVNAMEY]]
// HOST-NOT: __hipRegisterVar({{.*}}@_ZL2x2
// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6kernelPiPPKiE1w
// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6devfunPPKiE1p

View File

@ -0,0 +1,50 @@
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -emit-llvm -o - %s -fsyntax-only -verify=dev
// RUN: %clang_cc1 -triple x86_64-gnu-linux \
// RUN: -emit-llvm -o - %s -fsyntax-only -verify=host
// Checks allowed usage of file-scope and function-scope static variables.
// host-no-diagnostics
#include "Inputs/cuda.h"
// Checks static variables are allowed in device functions.
__device__ void f1() {
const static int b = 123;
static int a;
}
// Checks static variables are allowd in global functions.
__global__ void k1() {
const static int b = 123;
static int a;
}
// Checks static device and constant variables are allowed in device and
// host functions, and static host variables are not allowed in device
// functions.
static __device__ int x;
static __constant__ int y;
static int z;
__global__ void kernel(int *a) {
a[0] = x;
a[1] = y;
a[2] = z;
// dev-error@-1 {{reference to __host__ variable 'z' in __global__ function}}
}
int* getDeviceSymbol(int *x);
void foo() {
getDeviceSymbol(&x);
getDeviceSymbol(&y);
}