[CUDA] Allow the new driver to compile CUDA in non-RDC mode

The new driver primarily allows us to support RDC-mode compilations with
proper linking. This is not needed for non-RDC mode compilation, but we
still would like the new driver to be able to handle this mode so we can
transition away from the old driver in the future. This patch adds the
necessary code to support creating a fatbinary for CUDA code generation
as well as removing old assumptions and errors about RDC-mode with the
new driver.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D129655
This commit is contained in:
Joseph Huber 2022-07-13 11:25:31 -04:00
parent 403d61aedd
commit b370be37cc
8 changed files with 105 additions and 64 deletions

View File

@ -60,8 +60,6 @@ def err_drv_no_cuda_libdevice : Error<
"cannot find libdevice for %0; provide path to different CUDA installation "
"via '--cuda-path', or pass '-nocudalib' to build without linking with "
"libdevice">;
def err_drv_no_rdc_new_driver : Error<
"Using '--offload-new-driver' requires '-fgpu-rdc'">;
def err_drv_no_rocm_device_lib : Error<
"cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via "

View File

@ -212,8 +212,7 @@ static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
TheModule(CGM.getModule()),
RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode ||
CGM.getLangOpts().OffloadingNewDriver),
RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
DeviceMC(InitDeviceMC(CGM)) {
CodeGen::CodeGenTypes &Types = CGM.getTypes();
ASTContext &Ctx = CGM.getContext();
@ -1172,10 +1171,11 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
if (!(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
createOffloadingEntries();
else
return makeModuleCtorFunction();
createOffloadingEntries();
return nullptr;
}

View File

@ -2930,7 +2930,7 @@ class OffloadingActionBuilder final {
return false;
Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
options::OPT_fno_gpu_rdc, /*Default=*/false);
options::OPT_fno_gpu_rdc, /*Default=*/false);
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
assert(HostTC && "No toolchain for host compilation.");
@ -4486,11 +4486,23 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
if (offloadDeviceOnly())
return C.MakeAction<OffloadAction>(DDeps, types::TY_Nothing);
Action *OffloadPackager =
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
OffloadAction::DeviceDependences DDep;
DDep.add(*OffloadPackager, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
nullptr, Action::OFK_None);
if (C.isOffloadingHostKind(Action::OFK_Cuda) &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) {
// If we are not in RDC-mode we just emit the final CUDA fatbinary for each
// translation unit without requiring any linking.
Action *FatbinAction =
C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
nullptr, Action::OFK_Cuda);
} else {
// Package all the offloading actions into a single output that can be
// embedded in the host and linked.
Action *PackagerAction =
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);
DDep.add(*PackagerAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
nullptr, Action::OFK_None);
}
OffloadAction::HostDependence HDep(
*HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
/*BoundArch=*/nullptr, isa<CompileJobAction>(HostAction) ? DDep : DDeps);

View File

@ -4441,12 +4441,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false));
bool IsRDCMode =
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction);
auto LTOMode = D.getLTOMode(IsDeviceOffloadAction);
// A header module compilation doesn't have a main input file, so invent a
// fake one as a placeholder.
const char *ModuleName = [&]{
const char *ModuleName = [&] {
auto *ModuleNameArg = Args.getLastArg(options::OPT_fmodule_name_EQ);
return ModuleNameArg ? ModuleNameArg->getValue() : "";
}();
@ -6304,10 +6306,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
if (IsCuda || IsHIP) {
if (!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
Args.hasArg(options::OPT_offload_new_driver))
D.Diag(diag::err_drv_no_rdc_new_driver);
if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
if (IsRDCMode)
CmdArgs.push_back("-fgpu-rdc");
if (Args.hasFlag(options::OPT_fgpu_defer_diag,
options::OPT_fno_gpu_defer_diag, false))
@ -6979,11 +6978,22 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
}
// Host-side cuda compilation receives all device-side outputs in a single
// fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
// Host-side offloading compilation receives all device-side outputs. Include
// them in the host compilation depending on the target. If the host inputs
// are not empty we use the new-driver scheme, otherwise use the old scheme.
if ((IsCuda || IsHIP) && CudaDeviceInput) {
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
if (IsCuda && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
} else {
for (const InputInfo Input : HostOffloadingInputs)
CmdArgs.push_back(Args.MakeArgString("-fembed-offload-object=" +
TC.getInputFilename(Input)));
}
}
if (IsCuda) {
@ -7032,12 +7042,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
}
// Host-side offloading recieves the device object files and embeds it in a
// named section including the associated target triple and architecture.
for (const InputInfo Input : HostOffloadingInputs)
CmdArgs.push_back(Args.MakeArgString("-fembed-offload-object=" +
TC.getInputFilename(Input)));
if (Triple.isAMDGPU()) {
handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);

View File

@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex ".omp_offloading.entry.*"
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \
// RUN: --offload-new-driver -emit-llvm -o - -x cuda %s | FileCheck \
// RUN: --check-prefix=CUDA %s
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu \
// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fgpu-rdc \
// RUN: --offload-new-driver -emit-llvm -o - -x hip %s | FileCheck \
// RUN: --check-prefix=HIP %s

View File

@ -25,34 +25,34 @@
// Same tests for OpenMP
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -g -gz 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -g -gz 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf -fdebug-info-for-profiling 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -gdwarf -fdebug-info-for-profiling 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf-2 -gsplit-dwarf 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -gdwarf-2 -gsplit-dwarf 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf-3 -glldb 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -gdwarf-3 -glldb 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf-4 -gcodeview 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -gdwarf-4 -gcodeview 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf-5 -gmodules 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -gdwarf-5 -gmodules 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -ggdb1 -fdebug-macro 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -ggdb1 -fdebug-macro 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -ggdb2 -ggnu-pubnames 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -ggdb2 -ggnu-pubnames 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -ggdb3 -gdwarf-aranges 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -ggdb3 -gdwarf-aranges 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -g -gcolumn-info -fdebug-types-section 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: -fgpu-rdc -g -gcolumn-info -fdebug-types-section 2>&1 | FileCheck %s --check-prefixes WARN,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -c %s -gdwarf-5 -gembed-source 2>&1 \
// RUN: | FileCheck %s --check-prefixes WARN-GES,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -c %s -ggdb -gembed-source -gdwarf-5 2>&1 \
// RUN: | FileCheck %s --check-prefixes WARN-GES,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -gdwarf-5 -gembed-source 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON
// RUN: -fgpu-rdc -gdwarf-5 -gembed-source 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON
// RUN: %clang -### -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -c %s \
// RUN: -ggdb -gembed-source -gdwarf-5 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON
// RUN: -fgpu-rdc -ggdb -gembed-source -gdwarf-5 2>&1 | FileCheck %s --check-prefixes WARN-GES,COMMON
// COMMON: warning: debug information option '{{-gz|-fdebug-info-for-profiling|-gsplit-dwarf|-glldb|-gcodeview|-gmodules|-gembed-source|-fdebug-macro|-ggnu-pubnames|-gdwarf-aranges|-fdebug-types-section}}' is not supported
// WARN-SAME: for target 'nvptx64-nvidia-cuda' [-Wunsupported-target-opt]

View File

@ -13,9 +13,6 @@
// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
// RUN: %clang -### -nocudalib --offload-new-driver %s 2>&1 | FileCheck -check-prefix RDC %s
// RDC: error: Using '--offload-new-driver' requires '-fgpu-rdc'
// RUN: %clang -### -target x86_64-linux-gnu -nocudalib -ccc-print-bindings -fgpu-rdc \
// RUN: --offload-new-driver --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
// RUN: | FileCheck -check-prefix BINDINGS-HOST %s
@ -37,3 +34,10 @@
// RUN: | FileCheck -check-prefix DEVICE-LINK %s
// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"
// RUN: %clang -### -target x86_64-linux-gnu -nocudalib --offload-new-driver \
// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
// RUN: | FileCheck -check-prefix GPU-BINARY %s
// GPU-BINARY: fatbinary{{.*}}"--create" "{{.*}}.fatbin"
// GPU-BINARY: -cc1{{.*}}-fcuda-include-gpubinary" "{{.*}}.fatbin"

View File

@ -221,25 +221,48 @@
//
// Test the phases generated when using the new offloading driver.
//
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW_DRIVER %s
// NEW_DRIVER: 0: input, "[[INPUT:.+]]", cuda
// NEW_DRIVER: 1: preprocessor, {0}, cuda-cpp-output
// NEW_DRIVER: 2: compiler, {1}, ir
// NEW_DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
// NEW_DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
// NEW_DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52)
// NEW_DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52)
// NEW_DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52)
// NEW_DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
// NEW_DRIVER: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
// NEW_DRIVER: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
// NEW_DRIVER: 11: compiler, {10}, ir, (device-cuda, sm_70)
// NEW_DRIVER: 12: backend, {11}, assembler, (device-cuda, sm_70)
// NEW_DRIVER: 13: assembler, {12}, object, (device-cuda, sm_70)
// NEW_DRIVER: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
// NEW_DRIVER: 15: clang-offload-packager, {8, 14}, image
// NEW_DRIVER: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir
// NEW_DRIVER: 17: backend, {16}, assembler, (host-cuda)
// NEW_DRIVER: 18: assembler, {17}, object, (host-cuda)
// NEW_DRIVER: 19: clang-linker-wrapper, {18}, image, (host-cuda)
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER-RDC %s
// NEW-DRIVER-RDC: 0: input, "[[INPUT:.+]]", cuda
// NEW-DRIVER-RDC: 1: preprocessor, {0}, cuda-cpp-output
// NEW-DRIVER-RDC: 2: compiler, {1}, ir
// NEW-DRIVER-RDC: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
// NEW-DRIVER-RDC: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
// NEW-DRIVER-RDC: 5: compiler, {4}, ir, (device-cuda, sm_52)
// NEW-DRIVER-RDC: 6: backend, {5}, assembler, (device-cuda, sm_52)
// NEW-DRIVER-RDC: 7: assembler, {6}, object, (device-cuda, sm_52)
// NEW-DRIVER-RDC: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
// NEW-DRIVER-RDC: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
// NEW-DRIVER-RDC: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
// NEW-DRIVER-RDC: 11: compiler, {10}, ir, (device-cuda, sm_70)
// NEW-DRIVER-RDC: 12: backend, {11}, assembler, (device-cuda, sm_70)
// NEW-DRIVER-RDC: 13: assembler, {12}, object, (device-cuda, sm_70)
// NEW-DRIVER-RDC: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
// NEW-DRIVER-RDC: 15: clang-offload-packager, {8, 14}, image
// NEW-DRIVER-RDC: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir
// NEW-DRIVER-RDC: 17: backend, {16}, assembler, (host-cuda)
// NEW-DRIVER-RDC: 18: assembler, {17}, object, (host-cuda)
// NEW-DRIVER-RDC: 19: clang-linker-wrapper, {18}, image, (host-cuda)
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER %s
// NEW-DRIVER: 0: input, "[[INPUT:.+]]", cuda
// NEW-DRIVER: 1: preprocessor, {0}, cuda-cpp-output
// NEW-DRIVER: 2: compiler, {1}, ir
// NEW-DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
// NEW-DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
// NEW-DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52)
// NEW-DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52)
// NEW-DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52)
// NEW-DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
// NEW-DRIVER: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
// NEW-DRIVER: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
// NEW-DRIVER: 11: compiler, {10}, ir, (device-cuda, sm_70)
// NEW-DRIVER: 12: backend, {11}, assembler, (device-cuda, sm_70)
// NEW-DRIVER: 13: assembler, {12}, object, (device-cuda, sm_70)
// NEW-DRIVER: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
// NEW-DRIVER: 15: clang-offload-packager, {8, 14}, image
// NEW-DRIVER: 16: offload, " (powerpc64le-ibm-linux-gnu)" {2}, " (powerpc64le-ibm-linux-gnu)" {15}, ir
// NEW-DRIVER: 17: backend, {16}, assembler, (host-cuda)
// NEW-DRIVER: 18: assembler, {17}, object, (host-cuda)
// NEW-DRIVER: 19: clang-linker-wrapper, {18}, image, (host-cuda)