forked from OSchip/llvm-project
[HIP] Support `-fgpu-default-stream`
Introduce -fgpu-default-stream={legacy|per-thread} option to support per-thread default stream for HIP runtime. When -fgpu-default-stream=per-thread, HIP kernels are launched through hipLaunchKernel_spt instead of hipLaunchKernel. Also HIP_API_PER_THREAD_DEFAULT_STREAM=1 is defined by the preprocessor to enable other per-thread stream API's. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D120298
This commit is contained in:
parent
b01430a04f
commit
9d899d8f01
|
@ -309,6 +309,13 @@ public:
|
|||
ExtendTo64
|
||||
};
|
||||
|
||||
enum class GPUDefaultStreamKind {
|
||||
/// Legacy default stream
|
||||
Legacy,
|
||||
/// Per-thread default stream
|
||||
PerThread,
|
||||
};
|
||||
|
||||
public:
|
||||
/// The used language standard.
|
||||
LangStandard::Kind LangStd;
|
||||
|
@ -402,6 +409,9 @@ public:
|
|||
/// input is a header file (i.e. -x c-header).
|
||||
bool IsHeaderFile = false;
|
||||
|
||||
/// The default stream kind used for HIP kernel launching.
|
||||
GPUDefaultStreamKind GPUDefaultStream;
|
||||
|
||||
LangOptions();
|
||||
|
||||
// Define accessors/mutators for language options of enumeration type.
|
||||
|
|
|
@ -959,6 +959,13 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
|
|||
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
|
||||
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
|
||||
NegFlag<SetFalse>>;
|
||||
def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
|
||||
HelpText<"Specify default stream. Valid values are 'legacy' and 'per-thread'. The default value is 'legacy'. (HIP only)">,
|
||||
Flags<[CC1Option]>,
|
||||
Values<"legacy,per-thread">,
|
||||
NormalizedValuesScope<"LangOptions::GPUDefaultStreamKind">,
|
||||
NormalizedValues<["Legacy", "PerThread"]>,
|
||||
MarshallingInfoEnum<LangOpts<"GPUDefaultStream">, "Legacy">;
|
||||
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
|
||||
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
|
||||
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
|
||||
|
|
|
@ -332,15 +332,22 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
|
|||
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
|
||||
|
||||
// Lookup cudaLaunchKernel/hipLaunchKernel function.
|
||||
// HIP kernel launching API name depends on -fgpu-default-stream option. For
|
||||
// the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
|
||||
// it is hipLaunchKernel_spt.
|
||||
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
// void **args, size_t sharedMem,
|
||||
// cudaStream_t stream);
|
||||
// hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||
// void **args, size_t sharedMem,
|
||||
// hipStream_t stream);
|
||||
// hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
|
||||
// dim3 blockDim, void **args,
|
||||
// size_t sharedMem, hipStream_t stream);
|
||||
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
|
||||
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
|
||||
auto LaunchKernelName = addPrefixToName("LaunchKernel");
|
||||
std::string KernelLaunchAPI = "LaunchKernel";
|
||||
if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
|
||||
LangOptions::GPUDefaultStreamKind::PerThread)
|
||||
KernelLaunchAPI = KernelLaunchAPI + "_spt";
|
||||
auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
|
||||
IdentifierInfo &cudaLaunchKernelII =
|
||||
CGM.getContext().Idents.get(LaunchKernelName);
|
||||
FunctionDecl *cudaLaunchKernelFD = nullptr;
|
||||
|
|
|
@ -6915,8 +6915,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID)));
|
||||
}
|
||||
|
||||
if (IsHIP)
|
||||
if (IsHIP) {
|
||||
CmdArgs.push_back("-fcuda-allow-variadic-functions");
|
||||
Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ);
|
||||
}
|
||||
|
||||
if (IsCudaDevice || IsHIPDevice) {
|
||||
StringRef InlineThresh =
|
||||
|
|
|
@ -538,6 +538,9 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
|
|||
Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
|
||||
if (LangOpts.CUDAIsDevice)
|
||||
Builder.defineMacro("__HIP_DEVICE_COMPILE__");
|
||||
if (LangOpts.GPUDefaultStream ==
|
||||
LangOptions::GPUDefaultStreamKind::PerThread)
|
||||
Builder.defineMacro("HIP_API_PER_THREAD_DEFAULT_STREAM");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -35,11 +35,18 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
|
|||
extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
|
||||
size_t sharedSize = 0,
|
||||
hipStream_t stream = 0);
|
||||
#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
|
||||
extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
|
||||
dim3 blockDim, void **args,
|
||||
size_t sharedMem,
|
||||
hipStream_t stream);
|
||||
#else
|
||||
extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
|
||||
dim3 blockDim, void **args,
|
||||
size_t sharedMem,
|
||||
hipStream_t stream);
|
||||
#endif //HIP_API_PER_THREAD_DEFAULT_STREAM
|
||||
#else
|
||||
typedef struct cudaStream *cudaStream_t;
|
||||
typedef enum cudaError {} cudaError_t;
|
||||
extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
|
||||
|
|
|
@ -5,7 +5,13 @@
|
|||
// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
|
||||
// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
|
||||
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
|
||||
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK
|
||||
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
|
||||
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
|
||||
// RUN: -fgpu-default-stream=legacy \
|
||||
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
|
||||
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
|
||||
// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \
|
||||
// RUN: | FileCheck %s --check-prefixes=HIP-NEW,PTH,CHECK
|
||||
|
||||
#include "Inputs/cuda.h"
|
||||
|
||||
|
@ -13,7 +19,8 @@
|
|||
// HIP-OLD: call{{.*}}hipSetupArgument
|
||||
// HIP-OLD: call{{.*}}hipLaunchByPtr
|
||||
// HIP-NEW: call{{.*}}__hipPopCallConfiguration
|
||||
// HIP-NEW: call{{.*}}hipLaunchKernel
|
||||
// LEGACY: call{{.*}}hipLaunchKernel
|
||||
// PTH: call{{.*}}hipLaunchKernel_spt
|
||||
// CUDA-OLD: call{{.*}}cudaSetupArgument
|
||||
// CUDA-OLD: call{{.*}}cudaLaunch
|
||||
// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
|
||||
|
|
|
@ -14,6 +14,14 @@
|
|||
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
|
||||
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
|
||||
|
||||
// Check -fgpu-default-stream=per-thread.
|
||||
// RUN: %clang -### -nogpuinc -nogpulib -fgpu-default-stream=per-thread \
|
||||
// RUN: %s -save-temps 2>&1 | FileCheck -check-prefix=PTH %s
|
||||
// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
|
||||
// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
|
||||
// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
|
||||
// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
|
||||
|
||||
// RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \
|
||||
// RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \
|
||||
// RUN: FileCheck -check-prefix=MLLVM %s
|
||||
|
|
|
@ -247,6 +247,7 @@
|
|||
// CHECK-HIP-NEG-NOT: #define __CUDA_ARCH__
|
||||
// CHECK-HIP-NEG-NOT: #define __HIP_DEVICE_COMPILE__ 1
|
||||
// CHECK-HIP-NEG-NOT: #define __CLANG_RDC__ 1
|
||||
// CHECK-HIP-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
|
||||
|
||||
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
|
||||
// RUN: -fcuda-is-device \
|
||||
|
@ -265,6 +266,7 @@
|
|||
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-DEV-NEG
|
||||
// CHECK-HIP-DEV-NEG-NOT: #define __CUDA_ARCH__
|
||||
// CHECK-HIP-DEV-NEG-NOT: #define __CLANG_RDC__ 1
|
||||
// CHECK-HIP-DEV-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
|
||||
|
||||
// RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple x86_64-unknown-linux-gnu \
|
||||
// RUN: -fgpu-rdc | FileCheck %s --check-prefix=CHECK-RDC
|
||||
|
@ -277,3 +279,11 @@
|
|||
// RUN: -fgpu-rdc -fcuda-is-device \
|
||||
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RDC
|
||||
// CHECK-RDC: #define __CLANG_RDC__ 1
|
||||
|
||||
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
|
||||
// RUN: -fgpu-default-stream=per-thread \
|
||||
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
|
||||
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
|
||||
// RUN: -fcuda-is-device -fgpu-default-stream=per-thread \
|
||||
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
|
||||
// CHECK-PTH: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1
|
||||
|
|
Loading…
Reference in New Issue