[HIP] Support early finalization of device code for -fno-gpu-rdc

This patch renames -f{no-}cuda-rdc to -f{no-}gpu-rdc and keeps the original options as aliases. When -fgpu-rdc is off, clang will assume the device code in each translation unit does not call external functions except those in the device library, therefore it is possible to compile the device code in each translation unit to self-contained kernels and embed them in the host object, so that the host object behaves like usual host object which can be linked by lld. The benefits of this feature is: 1. allow users to create static libraries which can be linked by host linker; 2. amortized device code linking time. This patch modifies HIP action builder to insert actions for linking device code and generating HIP fatbin, and pass HIP fatbin to host backend action. It extracts code for constructing command for generating HIP fatbin as a function so that it can be reused by early finalization. It also modifies codegen of HIP host constructor functions to embed the device fatbin when it is available. Differential Revision: https://reviews.llvm.org/D52377 llvm-svn: 343611
2018-10-02 17:48:54 +00:00 · 2018-10-02 17:48:54 +00:00 · 9767089d00
parent 2b5259afb3
commit 9767089d00
20 changed files with 370 additions and 105 deletions
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@ -211,7 +211,7 @@ LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
-LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
+LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")

 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@ -584,9 +584,11 @@ def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-
 def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
  Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
 def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
-def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>,
+def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>,
  HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
-def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
+def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">;
+def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
+def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
 def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
  HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
 def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
--- a/clang/include/clang/Driver/Types.def
+++ b/clang/include/clang/Driver/Types.def
@ -101,4 +101,5 @@ TYPE("image",                    Image,        INVALID,         "out",   "")
 TYPE("dSYM",                     dSYM,         INVALID,         "dSYM",  "A")
 TYPE("dependencies",             Dependencies, INVALID,         "d",     "")
 TYPE("cuda-fatbin",              CUDA_FATBIN,  INVALID,         "fatbin","A")
+TYPE("hip-fatbin",               HIP_FATBIN,   INVALID,         "hipfb", "A")
 TYPE("none",                     Nothing,      INVALID,         nullptr, "u")
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@ -2459,7 +2459,7 @@ bool VarDecl::isKnownToBeDefined() const {
  //
  // With CUDA relocatable device code enabled, these variables don't get
  // special handling; they're treated like regular extern variables.
-  if (LangOpts.CUDA && !LangOpts.CUDARelocatableDeviceCode &&
+  if (LangOpts.CUDA && !LangOpts.GPURelocatableDeviceCode &&
      hasExternalStorage() && hasAttr<CUDASharedAttr>() &&
      isa<IncompleteArrayType>(getType()))
    return true;
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@ -137,7 +137,7 @@ CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
-      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
+      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

@ -353,8 +353,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
-  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
-  if (!IsHIP) {
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
+  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
@ -388,15 +388,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

-    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
-    // The external symbol is supposed to contain the fat binary but will be
-    // populated somewhere else, e.g. by lld through link script.
-    FatBinStr = new llvm::GlobalVariable(
+    if (CudaGpuBinary) {
+      // If fatbin is available from early finalization, create a string
+      // literal containing the fat binary loaded from the given file.
+      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                     FatbinConstantName, 8);
+    } else {
+      // If fatbin is not available, create an external symbol
+      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
+      // to contain the fat binary but will be populated somewhere else,
+      // e.g. by lld through link script.
+      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
-    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+    }

    FatMagic = HIPFatMagic;
  } else {
@ -447,6 +455,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
+    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
+        llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
@ -455,12 +465,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::LinkOnceAnyLinkage,
+        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
    // Prevent the weak symbol in different shared libraries being merged.
-    GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    if (Linkage != llvm::GlobalValue::InternalLinkage)
+      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final {
  class HIPActionBuilder final : public CudaActionBuilderBase {
    /// The linker inputs obtained for each device arch.
    SmallVector<ActionList, 8> DeviceLinkerInputs;
+    bool Relocatable;

  public:
    HIPActionBuilder(Compilation &C, DerivedArgList &Args,
                     const Driver::InputList &Inputs)
-        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+          Relocatable(false) {}

    bool canUseBundlerUnbundler() const override { return true; }

@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final {
                         phases::ID CurPhase, phases::ID FinalPhase,
                         PhasesTy &Phases) override {
      // amdgcn does not support linking of object files, therefore we skip
-      // backend and assemble phases to output LLVM IR.
-      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+      // backend and assemble phases to output LLVM IR. Except for generating
+      // non-relocatable device coee, where we generate fat binary for device
+      // code and pass to host in Backend phase.
+      if (CudaDeviceActions.empty() ||
+          (CurPhase == phases::Backend && Relocatable) ||
          CurPhase == phases::Assemble)
        return ABRT_Success;

-      assert((CurPhase == phases::Link ||
+      assert(((CurPhase == phases::Link && Relocatable) ||
              CudaDeviceActions.size() == GpuArchList.size()) &&
             "Expecting one action per GPU architecture.");
      assert(!CompileHostOnly &&
             "Not expecting CUDA actions in host-only compilation.");

-      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
-      // This happens to each device action originated from each input file.
-      // Later on, device actions in DeviceLinkerInputs are used to create
-      // device link actions in appendLinkDependences and the created device
-      // link actions are passed to the offload action as device dependence.
-      if (CurPhase == phases::Link) {
+      if (!Relocatable && CurPhase == phases::Backend) {
+        // If we are in backend phase, we attempt to generate the fat binary.
+        // We compile each arch to IR and use a link action to generate code
+        // object containing ISA. Then we use a special "link" action to create
+        // a fat binary containing all the code objects for different GPU's.
+        // The fat binary is then an input to the host action.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Create a link action to link device IR with device library
+          // and generate ISA.
+          ActionList AL;
+          AL.push_back(CudaDeviceActions[I]);
+          CudaDeviceActions[I] =
+              C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+          // OffloadingActionBuilder propagates device arch until an offload
+          // action. Since the next action for creating fatbin does
+          // not have device arch, whereas the above link action and its input
+          // have device arch, an offload action is needed to stop the null
+          // device arch of the next action being propagated to the above link
+          // action.
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+                   CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        // Create HIP fat binary with a special "link" action.
+        CudaFatBinary =
+            C.MakeAction<LinkJobAction>(CudaDeviceActions,
+                types::TY_HIP_FATBIN);
+
+        DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+               AssociatedOffloadKind);
+        // Clear the fat binary, it is already a dependence to an host
+        // action.
+        CudaFatBinary = nullptr;
+
+        // Remove the CUDA actions as they are already connected to an host
+        // action or fat binary.
+        CudaDeviceActions.clear();
+
+        return ABRT_Success;
+      } else if (CurPhase == phases::Link) {
+        // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+        // This happens to each device action originated from each input file.
+        // Later on, device actions in DeviceLinkerInputs are used to create
+        // device link actions in appendLinkDependences and the created device
+        // link actions are passed to the offload action as device dependence.
        DeviceLinkerInputs.resize(CudaDeviceActions.size());
        auto LI = DeviceLinkerInputs.begin();
        for (auto *A : CudaDeviceActions) {
@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final {
        ++I;
      }
    }
+
+    bool initialize() override {
+      Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+          options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+      return CudaActionBuilderBase::initialize();
+    }
  };

  /// OpenMP action builder. The host bitcode is passed to the device frontend
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@ -4920,16 +4920,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
    CmdArgs.push_back(Args.MakeArgString(Flags));
  }

-  if (IsCuda) {
-    // Host-side cuda compilation receives all device-side outputs in a single
-    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
-    if (CudaDeviceInput) {
+  // Host-side cuda compilation receives all device-side outputs in a single
+  // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+  if ((IsCuda || IsHIP) && CudaDeviceInput) {
      CmdArgs.push_back("-fcuda-include-gpubinary");
      CmdArgs.push_back(CudaDeviceInput->getFilename());
-    }
+      if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+        CmdArgs.push_back("-fgpu-rdc");
+  }

-    if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
-      CmdArgs.push_back("-fcuda-rdc");
+  if (IsCuda) {
    if (Args.hasFlag(options::OPT_fcuda_short_ptr,
                     options::OPT_fno_cuda_short_ptr, false))
      CmdArgs.push_back("-fcuda-short-ptr");
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@ -15,6 +15,7 @@
 #include "Arch/SystemZ.h"
 #include "Arch/X86.h"
 #include "Hexagon.h"
+#include "HIP.h"
 #include "InputInfo.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LangOptions.h"
@ -1337,6 +1338,18 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
  if (!JA.isHostOffloading(Action::OFK_HIP))
    return;

+  InputInfoList DeviceInputs;
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      DeviceInputs.push_back(II);
+    }
+  }
+
+  if (DeviceInputs.empty())
+    return;
+
  // Create temporary linker script. Keep it if save-temps is enabled.
  const char *LKS;
  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
@ -1364,39 +1377,12 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
         "Wrong platform");
  (void)HIPTC;

-  // Construct clang-offload-bundler command to bundle object files for
-  // for different GPU archs.
-  ArgStringList BundlerArgs;
-  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
-  // ToDo: Remove the dummy host binary entry which is required by
-  // clang-offload-bundler.
-  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
-  std::string BundlerInputArg = "-inputs=/dev/null";
-
-  for (const auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
-      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
-                         StringRef(A->getOffloadingArch()).str();
-      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
-    }
-  }
-  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
-  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
-  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  // The output file name needs to persist through the compilation, therefore
+  // it needs to be created through MakeArgString.
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
  const char *BundleFile =
      C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
-  auto BundlerOutputArg =
-      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
-  BundlerArgs.push_back(BundlerOutputArg);
-
-  SmallString<128> BundlerPath(C.getDriver().Dir);
-  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
-  const char *Bundler = Args.MakeArgString(BundlerPath);
-  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+  AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);

  // Add commands to embed target binaries. We ensure that each section and
  // image is 16-byte aligned. This is not mandatory, but increases the
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@ -398,8 +398,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                               options::OPT_fnoopenmp_relocatable_target,
                               /*Default=*/true);
  else if (JA.isOffloading(Action::OFK_Cuda))
-    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
-                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+    Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+                               options::OPT_fno_gpu_rdc, /*Default=*/false);

  if (Relocatable)
    CmdArgs.push_back("-c");
@ -609,9 +609,9 @@ void CudaToolChain::addClangTargetOptions(
                           options::OPT_fno_cuda_approx_transcendentals, false))
      CC1Args.push_back("-fcuda-approx-transcendentals");

-    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+    if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                           false))
-      CC1Args.push_back("-fcuda-rdc");
+      CC1Args.push_back("-fgpu-rdc");
  }

  if (DriverArgs.hasArg(options::OPT_nocudalib))
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@ -184,6 +184,40 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
  C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
 }

+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPU's into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &Args, const Tool& T) {
+  // Construct clang-offload-bundler command to bundle object files for
+  // for different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // ToDo: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const auto* A = II.getAction();
+    BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                       StringRef(A->getOffloadingArch()).str();
+    BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
 // For amdgcn the inputs of the linker job are device bitcode and output is
 // object file. It calls llvm-link, opt, llc, then lld steps.
 void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@ -192,6 +226,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                   const ArgList &Args,
                                   const char *LinkingOutput) const {

+  if (JA.getType() == types::TY_HIP_FATBIN)
+    return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
  assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
         "Unsupported target");

@ -244,9 +281,9 @@ void HIPToolChain::addClangTargetOptions(
                         options::OPT_fno_cuda_approx_transcendentals, false))
    CC1Args.push_back("-fcuda-approx-transcendentals");

-  if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+  if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                         false))
-    CC1Args.push_back("-fcuda-rdc");
+    CC1Args.push_back("-fgpu-rdc");

  // Default to "hidden" visibility, as object level linking will not be
  // supported for the foreseeable future.
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h
@ -19,6 +19,11 @@ namespace driver {
 namespace tools {

 namespace AMDGCN {
+  // Construct command for creating HIP fatbin.
+  void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+                  StringRef OutputFileName, const InputInfoList &Inputs,
+                  const llvm::opt::ArgList &TCArgs, const Tool& T);
+
 // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
 // device library, then compiles it to ISA in a shared object.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@ -2220,7 +2220,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
    Opts.CUDADeviceApproxTranscendentals = 1;

-  Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+  Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);

  if (Opts.ObjC1) {
    if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@ -4143,7 +4143,7 @@ static void handleSharedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
  const auto *VD = cast<VarDecl>(D);
  // extern __shared__ is only allowed on arrays with no length (e.g.
  // "int x[]").
-  if (!S.getLangOpts().CUDARelocatableDeviceCode && VD->hasExternalStorage() &&
+  if (!S.getLangOpts().GPURelocatableDeviceCode && VD->hasExternalStorage() &&
      !isa<IncompleteArrayType>(VD->getType())) {
    S.Diag(AL.getLoc(), diag::err_cuda_extern_shared) << VD;
    return;
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@ -6,22 +6,22 @@
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN

 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,NORDC,HIP,HIPNEF

 #include "Inputs/cuda.h"

@ -64,8 +64,9 @@ void use_pointers() {
 // * constant unnamed string with the kernel name
 // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
 // * constant unnamed string with GPU binary
-// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
 // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
 // CUDANORDC-SAME: section ".nv_fatbin", align 8
 // CUDARDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
@ -74,13 +75,14 @@ void use_pointers() {
 // CUDA-SAME: { i32 1180844977, i32 1,
 // HIP-SAME: { i32 1212764230, i32 1,
 // CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
-// HIP-SAME:  i8* @[[FATBIN]],
+// HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIPNEF-SAME:  i8* @[[FATBIN]],
 // ALL-SAME: i8* null }
 // CUDA-SAME: section ".nvFatBinSegment"
 // HIP-SAME: section ".hipFatBinSegment"
 // * variable to save GPU binary handle after initialization
 // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
-// HIP: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
+// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
@ -157,7 +159,7 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // device-side globals, but we still need to register GPU binary.
 // Skip GPU binary string first.
 // CUDANOGLOBALS: @{{.*}} = private constant{{.*}}
-// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = internal constant{{.*}}
 // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
 // NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
 // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
--- a/clang/test/Driver/cuda-external-tools.cu
+++ b/clang/test/Driver/cuda-external-tools.cu
@ -19,7 +19,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
 // Generating relocatable device code
-// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s

 // With debugging enabled, ptxas should be run with with no ptxas optimizations.
@ -46,21 +46,21 @@
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
 // Separate compilation targeting sm_35.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s

 // 32-bit compile.
 // RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
 // 32-bit compile when generating relocatable device code.
-// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s

 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
 // Check that we still pass -c when generating relocatable device code.
-// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s

 // Check -Xcuda-ptxas and -Xcuda-fatbinary
@ -77,11 +77,11 @@
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s

 // Check relocatable device code generation on MacOS.
-// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
-// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
-// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s

 // Check that CLANG forwards the -v flag to PTXAS.
@ -92,12 +92,12 @@
 // CHECK: "-cc1"
 // ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
 // ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
 // SM20-SAME: "-target-cpu" "sm_20"
 // SM35-SAME: "-target-cpu" "sm_35"
 // SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"

 // Match the call to ptxas (which assembles PTX to SASS).
 // CHECK: ptxas
@ -141,7 +141,7 @@
 // ARCH64-SAME: "-triple" "x86_64-
 // ARCH32-SAME: "-triple" "i386-
 // CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"

 // CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
--- a/clang/test/Driver/cuda-phases.cu
+++ b/clang/test/Driver/cuda-phases.cu
@ -11,12 +11,21 @@
 //
 // Test single gpu architecture with complete compilation.
 //
+// Test CUDA NVPTX phases.
 // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+//
+// Test HIP AMDGPU -fgpu-rdc phases.
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_RDC %s
+//
+// Test HIP AMDGPU -fno-gpu-rdc phases (default).
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_NRDC %s
+//
 // BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
 // BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
@ -32,12 +41,17 @@
 // BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
 // BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
 // BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_NRDC-DAG: [[P6:[0-9]+]]: linker, {[[P5]]}, image, (device-hip, [[ARCH]])
+// BIN_AMD_NRDC-DAG: [[P7:[0-9]+]]: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, image
+// BIN_AMD_NRDC-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, hip-fatbin, (device-hip)
+// BIN_AMD_NRDC-DAG: [[P11:[0-9]+]]: offload, "host-hip (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-hip (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// BIN_AMD_NRDC-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
 // BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 // BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
-// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
-// BIN_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
+// BIN_AMD_RDC-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD_RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD_RDC-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object

 //
 // Test single gpu architecture up to the assemble phase.
@ -46,7 +60,10 @@
 // RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fcuda-rdc %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
 // ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
 // ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
@ -66,7 +83,7 @@
 // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
 // BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
 // BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
@ -105,7 +122,7 @@
 // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
 // RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s -S 2>&1 \
 // RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
 // ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
 // ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
--- a/clang/test/Driver/hip-output-file-name.hip
+++ b/clang/test/Driver/hip-output-file-name.hip
@ -2,7 +2,7 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target

-// RUN: %clang -### -c -target x86_64-linux-gnu \
+// RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck %s

--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@ -0,0 +1,150 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -fno-gpu-rdc \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
+// RUN:   -fuse-ld=lld \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+
+//
+// Compile device code in a.cu to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+
+//
+// Compile device code in a.cu to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+
+//
+// Bundle and embed device code in host object for a.cu.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Compile device code in b.hip to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+
+//
+// Compile device code in b.hip to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+
+//
+// Bundle and embed device code in host object for b.hip.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Link host objects.
+//
+
+// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
+// CHECK-NOT: "-T" "{{.*}}.lk"
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@ -7,7 +7,7 @@
 // RUN:   --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
-// RUN:   -fuse-ld=lld \
+// RUN:   -fuse-ld=lld -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s
@ -15,14 +15,14 @@
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" 
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
 // CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]

 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
 // CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]

@ -80,7 +80,7 @@

 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*o]]"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"

 // CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
 // CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
--- a/clang/test/SemaCUDA/extern-shared.cu
+++ b/clang/test/SemaCUDA/extern-shared.cu
@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -verify %s
 // RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -verify %s

-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-rdc -verify=rdc %s
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fcuda-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fgpu-rdc -verify=rdc %s

 // Most of these declarations are fine in separate compilation mode.