[clang][HIP] Updating driver to enable archive/bitcode to bitcode linking when targeting HIPAMD toolchain

Differential Revision: https://reviews.llvm.org/D124151
2022-04-21 00:01:15 -07:00 · 2022-04-21 00:01:15 -07:00 · afcc6baac5
parent 8960ba7491
commit afcc6baac5
5 changed files with 117 additions and 9 deletions
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@ -3065,7 +3065,7 @@ class OffloadingActionBuilder final {

      // amdgcn does not support linking of object files, therefore we skip
      // backend and assemble phases to output LLVM IR. Except for generating
-      // non-relocatable device coee, where we generate fat binary for device
+      // non-relocatable device code, where we generate fat binary for device
      // code and pass to host in Backend phase.
      if (CudaDeviceActions.empty())
        return ABRT_Success;
@ -3074,7 +3074,7 @@ class OffloadingActionBuilder final {
              CudaDeviceActions.size() == GpuArchList.size()) &&
             "Expecting one action per GPU architecture.");
      assert(!CompileHostOnly &&
-             "Not expecting CUDA actions in host-only compilation.");
+             "Not expecting HIP actions in host-only compilation.");

      if (!Relocatable && CurPhase == phases::Backend && !EmitLLVM &&
          !EmitAsm) {
@ -3203,12 +3203,16 @@ class OffloadingActionBuilder final {
             "Linker inputs and GPU arch list sizes do not match.");

      ActionList Actions;
-      // Append a new link action for each device.
      unsigned I = 0;
+      // Append a new link action for each device.
+      // Each entry in DeviceLinkerInputs corresponds to a GPU arch.
      for (auto &LI : DeviceLinkerInputs) {
-        // Each entry in DeviceLinkerInputs corresponds to a GPU arch.
-        auto *DeviceLinkAction =
-            C.MakeAction<LinkJobAction>(LI, types::TY_Image);
+
+        types::ID Output = Args.hasArg(options::OPT_emit_llvm)
+                                   ? types::TY_LLVM_BC
+                                   : types::TY_Image;
+
+        auto *DeviceLinkAction = C.MakeAction<LinkJobAction>(LI, Output);
        // Linking all inputs for the current GPU arch.
        // LI contains all the inputs for the linker.
        OffloadAction::DeviceDependences DeviceLinkDeps;
@ -3220,6 +3224,12 @@ class OffloadingActionBuilder final {
      }
      DeviceLinkerInputs.clear();

+      // When emitting LLVM, skip the final host/device compilation action.
+      if (Args.hasArg(options::OPT_emit_llvm)) {
+          AL.append(Actions);
+          return;
+      }
+
      // Create a host object from all the device images by embedding them
      // in a fat binary for mixed host-device compilation. For device-only
      // compilation, creates a fat binary.
@ -3747,7 +3757,8 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
  phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg);

  if (FinalPhase == phases::Link) {
-    if (Args.hasArg(options::OPT_emit_llvm))
+    // Emitting LLVM while linking is only allowed with --hip-link (HIPAMD).
+    if (Args.hasArg(options::OPT_emit_llvm) && !Args.hasArg(options::OPT_hip_link))
      Diag(clang::diag::err_drv_emit_llvm_link);
    if (IsCLMode() && LTOMode != LTOK_None &&
        !Args.getLastArgValue(options::OPT_fuse_ld_EQ)
@ -3932,7 +3943,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
      // Queue linker inputs.
      if (Phase == phases::Link) {
        assert(Phase == PL.back() && "linking must be final compilation step.");
-        LinkerInputs.push_back(Current);
+        // No additional link commands are needed when emitting AMD bitcode.
+        if (!(C.getInputArgs().hasArg(options::OPT_hip_link) &&
+             (C.getInputArgs().hasArg(options::OPT_emit_llvm))))
+          LinkerInputs.push_back(Current);
        Current = nullptr;
        break;
      }
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@ -72,6 +72,36 @@ static bool shouldSkipSanitizeOption(const ToolChain &TC,
  return false;
 }

+/// Emit the llvm-link command that links the device bitcode files in \p Inputs
+/// into the single bitcode file \p Output (used for -emit-llvm device links).
+void AMDGCN::Linker::constructLlvmLinkCommand(
+    Compilation &C, const JobAction &JA, const InputInfoList &Inputs,
+    const InputInfo &Output, const llvm::opt::ArgList &Args) const {
+  // Construct llvm-link command.
+  // The output from llvm-link is a bitcode file.
+  ArgStringList LlvmLinkArgs;
+
+  assert(!Inputs.empty() && "Must have at least one input.");
+
+  LlvmLinkArgs.append({"-o", Output.getFilename()});
+  // Iterate by const reference so each InputInfo is not copied.
+  for (const auto &Input : Inputs)
+    LlvmLinkArgs.push_back(Input.getFilename());
+
+  // Look for archive of bundled bitcode in arguments, and add temporary files
+  // for the extracted archive of bitcode to inputs.
+  auto TargetID = Args.getLastArgValue(options::OPT_mcpu_EQ);
+  AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, LlvmLinkArgs, "amdgcn",
+                             TargetID, /*IsBitCodeSDL=*/true,
+                             /*PostClangLink=*/false);
+
+  const char *LlvmLink =
+      Args.MakeArgString(getToolChain().GetProgramPath("llvm-link"));
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         LlvmLink, LlvmLinkArgs, Inputs,
+                                         Output));
+}
+
 void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
                                         const InputInfoList &Inputs,
                                         const InputInfo &Output,
@ -135,7 +165,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
 }

 // For amdgcn the inputs of the linker job are device bitcode and output is
-// object file. It calls llvm-link, opt, llc, then lld steps.
+// either an object file or bitcode (-emit-llvm). For bitcode output only
+// llvm-link is run; otherwise it calls llvm-link, opt, llc, then lld steps.
 void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                  const InputInfo &Output,
                                  const InputInfoList &Inputs,
@ -151,6 +182,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
    return HIP::constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs,
                                          Args, *this);

+  if (JA.getType() == types::TY_LLVM_BC)
+    return constructLlvmLinkCommand(C, JA, Inputs, Output, Args);
+
  return constructLldCommand(C, JA, Inputs, Output, Args);
 }

--- a/clang/lib/Driver/ToolChains/HIPAMD.h
+++ b/clang/lib/Driver/ToolChains/HIPAMD.h
@ -36,6 +36,10 @@ private:
  void constructLldCommand(Compilation &C, const JobAction &JA,
                           const InputInfoList &Inputs, const InputInfo &Output,
                           const llvm::opt::ArgList &Args) const;
+  void constructLlvmLinkCommand(Compilation &C, const JobAction &JA,
+                                const InputInfoList &Inputs,
+                                const InputInfo &Output,
+                                const llvm::opt::ArgList &Args) const;
 };

 } // end namespace AMDGCN
--- a/clang/test/Driver/hip-link-bc-to-bc.hip
+++ b/clang/test/Driver/hip-link-bc-to-bc.hip
@ -0,0 +1,34 @@
+// REQUIRES: clang-driver, x86-registered-target, amdgpu-registered-target
+
+// Check that clang unbundles the two bitcodes and links via llvm-link
+// RUN: touch %T/bundle1.bc
+// RUN: touch %T/bundle2.bc
+
+// RUN: %clang -### --offload-arch=gfx906 --hip-link \
+// RUN:   -emit-llvm -fgpu-rdc --cuda-device-only \
+// RUN:   %T/bundle1.bc %T/bundle2.bc \
+// RUN:   2>&1 | FileCheck -check-prefix=BITCODE %s
+
+// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle1.bc" "-output=[[B1HOST:.*\.bc]]" "-output=[[B1DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles"
+// BITCODE: "{{.*}}clang-{{.*}}" "-o" "[[B1DEV2:.*bundle1-gfx906.bc]]" "-x" "ir" "[[B1DEV1]]"
+
+// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle2.bc" "-output=[[B2HOST:.*\.bc]]" "-output=[[B2DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles"
+// BITCODE: "{{.*}}clang-{{.*}}" "-o" "[[B2DEV2:.*bundle2-gfx906.bc]]" "-x" "ir" "[[B2DEV1]]"
+
+// BITCODE: "{{.*}}llvm-link" "-o" "bundle1-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[B1DEV2]]" "[[B2DEV2]]"
+
+// Check that clang unbundles the bitcode and archive and links via llvm-link
+// RUN: touch %T/libhipbundle.a
+// RUN: touch %T/bundle.bc
+
+// RUN: %clang -### --offload-arch=gfx906 --hip-link \
+// RUN:   -emit-llvm -fgpu-rdc --cuda-device-only \
+// RUN:   %T/bundle.bc -L%T -lhipbundle \
+// RUN:   2>&1 | FileCheck -check-prefix=ARCHIVE %s
+
+// ARCHIVE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle.bc" "-output=[[HOST:.*\.bc]]" "-output=[[DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles"
+// ARCHIVE: "{{.*}}clang-{{.*}}" "-o" "[[DEV2:.*\.bc]]" "-x" "ir" "[[DEV1]]"
+
+// ARCHIVE: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}libhipbundle.a" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-output=[[AR:.*\.a]]" "-allow-missing-bundles" "-hip-openmp-compatible"
+
+// ARCHIVE: "{{.*}}llvm-link" "-o" "bundle-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[DEV2]]" "[[AR]]"
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@ -520,3 +520,25 @@
 // MIXED2-DAG: input, "{{.*}}empty.cpp", hip, (device-hip, gfx803)
 // MIXED2-DAG: input, "{{.*}}empty.cpp", hip, (device-hip, gfx900)
 // MIXED2-NEG-NOT: input, "{{.*}}empty.cpp", c++
+
+// Test HIP bitcode to bitcode linking. Input should be bundled or unbundled bitcode, and
+// output should be unbundled linked bitcode
+
+// RUN: touch %T/bitcodeA.bc
+// RUN: touch %T/bitcodeB.bc
+// RUN: %clang -ccc-print-phases --hip-link -emit-llvm --cuda-device-only \
+// RUN: --offload-arch=gfx906 %T/bitcodeA.bc %T/bitcodeB.bc 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK %s
+
+// CHECK: [[A0:[0-9]+]]: input, "{{.*}}bitcodeA.bc", ir
+// CHECK: [[A1:[0-9]+]]: clang-offload-unbundler, {[[A0]]}, ir
+// CHECK: [[A2:[0-9]+]]: compiler, {[[A1]]}, ir, (device-hip, [[ARCH:gfx906]])
+// CHECK: [[A3:[0-9]+]]: backend, {[[A2]]}, ir, (device-hip, [[ARCH]])
+
+// CHECK: [[B0:[0-9]+]]: input, "{{.*}}bitcodeB.bc", ir
+// CHECK: [[B1:[0-9]+]]: clang-offload-unbundler, {[[B0]]}, ir
+// CHECK: [[B2:[0-9]+]]: compiler, {[[B1]]}, ir, (device-hip, [[ARCH]])
+// CHECK: [[B3:[0-9]+]]: backend, {[[B2]]}, ir, (device-hip, [[ARCH]])
+
+// CHECK: [[L0:[0-9]+]]: linker, {[[A3]], [[B3]]}, ir, (device-hip, [[ARCH]])
+// CHECK: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[L0]]}, ir