[AMDGPU] Pass to propagate ABI attributes from kernels to the functions

The pass works in two modes: Mode 1: Just set attributes starting from kernels. This can work at the very beginning of opt and llc pipeline, but cannot clone functions because it must be a function pass. Mode 2: Actually clone functions for new attributes. This can only work after all function passes in the opt pipeline because it has to be a module pass. Differential Revision: https://reviews.llvm.org/D63208 llvm-svn: 363586
2019-06-17 17:47:28 +00:00 · 2019-06-17 17:47:28 +00:00 · ad04e7ad42
parent b8e8b1769f
commit ad04e7ad42
6 changed files with 515 additions and 4 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@ -57,6 +57,8 @@ FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
 FunctionPass *createSIModeRegisterPass();
@ -91,6 +93,12 @@ ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
 void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
 extern char &AMDGPUPropagateAttributesEarlyID;
 void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
 extern char &AMDGPUPropagateAttributesLateID;
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@ -0,0 +1,336 @@
 //===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// \brief This pass propagates attributes from kernels to the non-entry
 /// functions. Most of the library functions were not compiled for a specific
 /// ABI, yet will be correctly compiled if proper attributes are propagated
 /// from the caller.
 ///
 /// The pass analyzes call graph and propagates ABI target features through the
 /// call graph.
 ///
 /// It can run in two modes: as a function or module pass. A function pass
 /// simply propagates attributes. A module pass clones functions if there are
 /// callers with a different ABI. If a function is cloned, all call sites will
 /// be updated to use the correct clone.
 ///
 /// A function pass is limited in functionality but can run early in the
 /// pipeline. A module pass is more powerful but has to run late, so misses
 /// library folding opportunities.
 //
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "amdgpu-propagate-attributes"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <string>
 using namespace llvm;
 namespace llvm {
 // Table of subtarget feature descriptors, defined outside this file.
 // getFeatureString() walks it and uses each entry's Key (feature name) and
 // Value (feature bit index) to build a "target-features" string.
 // NOTE(review): presumably the TableGen-generated feature table; the array
 // bound must match the definition exactly -- confirm against the generator.
 extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
 }
 namespace {
 // Worker shared by the early (function) and late (module) passes. It starts
 // from a set of root functions and pushes the roots' wavefront-size features
 // down to the functions they call, either by rewriting the callee's
 // "target-features" attribute in place or, if AllowClone is set, by cloning
 // the callee and redirecting the call to the clone.
 class AMDGPUPropagateAttributes {
  // The only features this pass propagates. All other feature bits of a
  // callee are preserved when new features are computed.
  const FeatureBitset TargetFeatures = {
    AMDGPU::FeatureWavefrontSize16,
    AMDGPU::FeatureWavefrontSize32,
    AMDGPU::FeatureWavefrontSize64
  };
  // Record of one clone: NewF was cloned from OrigF for callers whose
  // propagated features equal FeatureMask.
  class Clone{
  public:
    Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
      FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
    FeatureBitset FeatureMask;
    Function *OrigF;
    Function *NewF;
  };
  // Target machine used to query the subtarget (and thus feature bits) of a
  // function. Dereferenced without a null check; callers must pass non-null.
  const TargetMachine *TM;
  // Clone functions as needed (true) or just set attributes in place (false).
  bool AllowClone;
  // Functions whose features are propagated to their callees. Grows while
  // process() runs.
  SmallSet<Function *, 32> Roots;
  // Clones of functions with their attributes.
  SmallVector<Clone, 32> Clones;
  // Find an existing clone of OrigF created for FeaturesNeeded.
  // Returns nullptr if there is none.
  Function *findFunction(const FeatureBitset &FeaturesNeeded,
                         Function *OrigF);
  // Clone function F and set NewFeatures on the clone.
  // The clone takes the name of the original function.
  Function *cloneWithFeatures(Function &F,
                              const FeatureBitset &NewFeatures);
  // Replace the "target-features" attribute of F with NewFeatures.
  void setFeatures(Function &F, const FeatureBitset &NewFeatures);
  // Render Features as a comma separated "target-features" string.
  std::string getFeatureString(const FeatureBitset &Features) const;
  // Propagate attributes from Roots. Returns true if anything changed.
  bool process();
 public:
  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
    TM(TM), AllowClone(AllowClone) {}
  // Use F as a root and propagate its attributes.
  bool process(Function &F);
  // Propagate attributes starting from kernel functions.
  bool process(Module &M);
 };
 // Function pass flavor: propagates attributes early, but cloning is not
 // allowed because it must be a function pass to run before any optimizations.
 // TODO: Only a single instance of the module pass should be needed, but it
 // would have to run in the linker pipeline, which is currently not possible.
 class AMDGPUPropagateAttributesEarly : public FunctionPass {
  // Target machine handed to the propagation worker. May be null (default
  // constructor argument), in which case runOnFunction() does nothing.
  const TargetMachine *TM;
 public:
  static char ID; // Pass identification
  // Registers the pass with the global PassRegistry on construction.
  AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(TM) {
    initializeAMDGPUPropagateAttributesEarlyPass(
      *PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override;
 };
 // Module pass flavor: propagates attributes with cloning allowed, but runs
 // late in the pipeline.
 class AMDGPUPropagateAttributesLate : public ModulePass {
  // Target machine handed to the propagation worker. May be null (default
  // constructor argument), in which case runOnModule() does nothing.
  const TargetMachine *TM;
 public:
  static char ID; // Pass identification
  // Registers the pass with the global PassRegistry on construction.
  AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) :
    ModulePass(ID), TM(TM) {
    initializeAMDGPUPropagateAttributesLatePass(
      *PassRegistry::getPassRegistry());
  }
  bool runOnModule(Module &M) override;
 };
 }  // end anonymous namespace.
 // Storage for the pass identifiers that the pass constructors hand to the
 // FunctionPass / ModulePass base classes.
 char AMDGPUPropagateAttributesEarly::ID = 0;
 char AMDGPUPropagateAttributesLate::ID = 0;
 // Pass registration: the second argument is the pass's command-line name and
 // the third its description.
 // NOTE(review): the trailing "false, false" are presumably the CFG-only and
 // is-analysis flags of INITIALIZE_PASS -- confirm against PassSupport.h.
 INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
                 "amdgpu-propagate-attributes-early",
                 "Early propagate attributes from kernels to functions",
                 false, false)
 INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
                 "amdgpu-propagate-attributes-late",
                 "Late propagate attributes from kernels to functions",
                 false, false)
 // Look up a previously created clone of OrigF whose recorded feature mask is
 // exactly FeaturesNeeded. Returns the clone, or nullptr when none exists.
 Function *
 AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
                                         Function *OrigF) {
  // TODO: search for clone's clones.
  for (Clone &Candidate : Clones) {
    // Skip clones that were made from a different function.
    if (Candidate.OrigF != OrigF)
      continue;
    if (FeaturesNeeded == Candidate.FeatureMask)
      return Candidate.NewF;
  }
  return nullptr;
 }
 // Seed the root set with every function of M that has an entry-function
 // calling convention, then run the propagation. Returns true on change.
 bool AMDGPUPropagateAttributes::process(Module &M) {
  for (auto &Fn : M.functions()) {
    if (!AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      continue;
    Roots.insert(&Fn);
  }
  return process();
 }
 // Register F as a propagation root and run the propagation from the current
 // root set. Returns true if anything was changed.
 bool AMDGPUPropagateAttributes::process(Function &F) {
  Function *Root = &F;
  Roots.insert(Root);
  const bool Modified = process();
  return Modified;
 }
 // Propagate the TargetFeatures bits of every function in Roots to the
 // functions they call directly, iterating until no new function is reached.
 //
 // For each defined function F that is not yet a root, every call site that
 // calls F from a root is inspected:
 //  - If caller and callee already agree on TargetFeatures, F becomes a root
 //    of the next iteration.
 //  - Otherwise, without AllowClone, F's features are overwritten in place
 //    with the first disagreeing caller's features and F becomes a root.
 //  - Otherwise, with AllowClone, the call is redirected to a clone of F that
 //    carries the caller's features. Clones are created on demand, reused for
 //    later callers with the same features, and become roots themselves.
 //
 // Originals whose calls were redirected are erased if they end up unused.
 // Returns true if the module was modified.
 bool AMDGPUPropagateAttributes::process() {
  bool Changed = false;
  SmallSet<Function *, 32> NewRoots;
  SmallSet<Function *, 32> Replaced;
  if (Roots.empty())
    return false;
  // All roots live in the same module; take it from any of them.
  Module &M = *(*Roots.begin())->getParent();
  do {
    // Functions reached in the previous iteration propagate in this one.
    Roots.insert(NewRoots.begin(), NewRoots.end());
    NewRoots.clear();
    for (auto &F : M.functions()) {
      // Declarations have no body to compile; roots are already settled.
      if (F.isDeclaration() || Roots.count(&F))
        continue;
      const FeatureBitset &CalleeBits =
        TM->getSubtargetImpl(F)->getFeatureBits();
      // Call sites are rewritten only after the use list walk is finished, so
      // the walk never observes a use list that is being modified.
      SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
      for (User *U : F.users()) {
        CallBase *CI = dyn_cast<CallBase>(U);
        // Only handle call sites that actually call F. F may also be used as
        // a call argument; such a call must not have its callee replaced.
        if (!CI || CI->getCalledFunction() != &F)
          continue;
        Function *Caller = CI->getCaller();
        if (!Caller)
          continue;
        // Only callers whose own features are already final may propagate.
        if (!Roots.count(Caller))
          continue;
        const FeatureBitset &CallerBits =
          TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
        if (CallerBits == (CalleeBits  & TargetFeatures)) {
          NewRoots.insert(&F);
          continue;
        }
        Function *NewF = findFunction(CallerBits, &F);
        if (!NewF) {
          // Keep all unrelated callee features, take the propagated ones from
          // the caller.
          FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
                                    CallerBits);
          if (!AllowClone) {
            // This may set different features on different iterations if
            // there is a contradiction in callers' attributes. In this case
            // we rely on a second pass running on Module, which is allowed
            // to clone.
            setFeatures(F, NewFeatures);
            NewRoots.insert(&F);
            Changed = true;
            break;
          }
          NewF = cloneWithFeatures(F, NewFeatures);
          Clones.push_back(Clone(CallerBits, &F, NewF));
          NewRoots.insert(NewF);
        }
        ToReplace.push_back(std::make_pair(CI, NewF));
        Replaced.insert(&F);
        Changed = true;
      }
      while (!ToReplace.empty()) {
        auto R = ToReplace.pop_back_val();
        R.first->setCalledFunction(R.second);
      }
    }
  } while (!NewRoots.empty());
  // Drop originals that lost all their uses to clones.
  for (Function *F : Replaced) {
    if (F->use_empty())
      F->eraseFromParent();
  }
  return Changed;
 }
 // Create a copy of F that carries NewFeatures and return it. If F is named,
 // the copy takes over F's name, F receives the copy's former name, and F is
 // demoted to internal linkage with default visibility.
 Function *
 AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
                                              const FeatureBitset &NewFeatures) {
  LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
  ValueToValueMapTy VMap;
  Function *Copy = CloneFunction(&F, VMap);
  setFeatures(*Copy, NewFeatures);
  // Nothing to swap for an unnamed function.
  if (!F.hasName())
    return Copy;
  // Swap names. If that is the only clone it will retain the name of now
  // dead value.
  std::string CopyName = Copy->getName();
  Copy->takeName(&F);
  F.setName(CopyName);
  // Name has changed, it does not need an external symbol.
  F.setVisibility(GlobalValue::DefaultVisibility);
  F.setLinkage(GlobalValue::InternalLinkage);
  return Copy;
 }
 // Replace the "target-features" attribute of F with the string form of
 // NewFeatures. The debug output only shows the propagated feature subset.
 void AMDGPUPropagateAttributes::setFeatures(Function &F,
                                             const FeatureBitset &NewFeatures) {
  LLVM_DEBUG(dbgs() << "Set features "
                    << getFeatureString(NewFeatures & TargetFeatures)
                    << " on " << F.getName() << '\n');
  std::string FeatureStr = getFeatureString(NewFeatures);
  // Drop any previous value before adding the new one.
  F.removeFnAttr("target-features");
  F.addFnAttr("target-features", FeatureStr);
 }
 std::string
 AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
 {
  std::string Ret;
  for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) {
    if (Features[KV.Value])
      Ret += (StringRef("+") + KV.Key + ",").str();
    else if (TargetFeatures[KV.Value])
      Ret += (StringRef("-") + KV.Key + ",").str();
  }
  Ret.pop_back(); // Remove last comma.
  return Ret;
 }
 // Propagate attributes from F without cloning. Does nothing (and reports no
 // change) if no target machine was supplied or F is not an entry function.
 bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
  if (!TM)
    return false;
  if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  AMDGPUPropagateAttributes Propagator(TM, false);
  return Propagator.process(F);
 }
 // Propagate attributes through M with cloning enabled. Does nothing (and
 // reports no change) if no target machine was supplied.
 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
  if (TM) {
    AMDGPUPropagateAttributes Propagator(TM, true);
    return Propagator.process(M);
  }
  return false;
 }
 // Factory for the early (function pass, no cloning) flavor. The returned
 // pass is heap allocated with new.
 FunctionPass *
 llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
  FunctionPass *Pass = new AMDGPUPropagateAttributesEarly(TM);
  return Pass;
 }
 // Factory for the late (module pass, cloning allowed) flavor. The returned
 // pass is heap allocated with new.
 ModulePass *
 llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
  ModulePass *Pass = new AMDGPUPropagateAttributesLate(TM);
  return Pass;
 }
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -217,6 +217,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
@ -402,13 +404,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
-    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
+    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
-                                         legacy::PassManagerBase &PM) {
+                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
@ -420,12 +423,13 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
-    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
+    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
-                                      legacy::PassManagerBase &PM) {
+                                            legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
@ -654,6 +658,9 @@ void AMDGPUPassConfig::addIRPasses() {
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
  addPass(createAtomicExpandPass());
  // This must occur before inlining, as the inliner will not look through
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUMCInstLower.cpp
  AMDGPUOpenCLEnqueuedBlockLowering.cpp
  AMDGPUPromoteAlloca.cpp
  AMDGPUPropagateAttributes.cpp
  AMDGPURegAsmNames.inc.cpp
  AMDGPURegisterBankInfo.cpp
  AMDGPURegisterInfo.cpp
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll
@ -0,0 +1,87 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s
 ; OPT: declare void @foo4() local_unnamed_addr #0
 ; OPT: define internal fastcc void @foo3.2() unnamed_addr #1
 ; OPT: define void @foo2() local_unnamed_addr #1
 ; OPT: define internal fastcc void @foo1.1() unnamed_addr #1
 ; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2
 ; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #3
 ; OPT: define amdgpu_kernel void @kernel3() local_unnamed_addr #3
 ; OPT: define void @foo1() local_unnamed_addr #4
 ; OPT: define void @foo3() local_unnamed_addr #4
 ; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" }
 ; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,-wavefrontsize32,+wavefrontsize64{{.*}}" }
 ; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32" }
 ; OPT: attributes #3 = { {{.*}} "target-features"="+wavefrontsize64" }
 ; OPT: attributes #4 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64{{.*}}" }
 ; LLC: foo3:
 ; LLC: sample asm
 ; LLC: foo2:
 ; LLC: sample asm
 ; LLC: foo1:
 ; LLC: foo4@gotpcrel32@lo+4
 ; LLC: foo4@gotpcrel32@hi+4
 ; LLC: foo3@gotpcrel32@lo+4
 ; LLC: foo3@gotpcrel32@hi+4
 ; LLC: foo2@gotpcrel32@lo+4
 ; LLC: foo2@gotpcrel32@hi+4
 ; LLC: foo1@gotpcrel32@lo+4
 ; LLC: foo1@gotpcrel32@hi+4
 ; LLC: kernel1:
 ; LLC: foo1@gotpcrel32@lo+4
 ; LLC: foo1@gotpcrel32@hi+4
 ; LLC: kernel2:
 ; LLC: foo2@gotpcrel32@lo+4
 ; LLC: foo2@gotpcrel32@hi+4
 ; LLC: kernel3:
 ; LLC: foo1@gotpcrel32@lo+4
 ; LLC: foo1@gotpcrel32@hi+4
 ; Test input: non-kernel functions all carry attribute group #1
 ; (+wavefrontsize64) while the kernels that reach them disagree on the
 ; wavefront size, so @foo1 has callers with conflicting features.

 ; Declaration only; it has no body.
 declare void @foo4() #1
 ; Leaf function, called from @foo1.
 define void @foo3() #1 {
 entry:
  call void asm sideeffect "; sample asm", ""()
  ret void
 }
 ; Leaf function, called from @foo1 (twice) and from @kernel2.
 define void @foo2() #1 {
 entry:
  call void asm sideeffect "; sample asm", ""()
  ret void
 }
 ; Called from @kernel1 (+wavefrontsize32) and @kernel3 (+wavefrontsize64);
 ; also calls itself.
 define void @foo1() #1 {
 entry:
  tail call void @foo4()
  tail call void @foo3()
  tail call void @foo2()
  tail call void @foo2()
  tail call void @foo1()
  ret void
 }
 ; Kernel with attribute group #0 (+wavefrontsize32).
 define amdgpu_kernel void @kernel1() #0 {
 entry:
  tail call void @foo1()
  ret void
 }
 ; Kernel with attribute group #2 (+wavefrontsize64).
 define amdgpu_kernel void @kernel2() #2 {
 entry:
  tail call void @foo2()
  ret void
 }
 ; Kernel with attribute group #3 (+wavefrontsize64).
 define amdgpu_kernel void @kernel3() #3 {
 entry:
  tail call void @foo1()
  ret void
 }
 ; #0: wave32 kernel, #1: noinline non-kernel functions, #2/#3: wave64 kernels.
 attributes #0 = { nounwind "target-features"="+wavefrontsize32" }
 attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" }
 attributes #2 = { nounwind "target-features"="+wavefrontsize64" }
 attributes #3 = { nounwind "target-features"="+wavefrontsize64" }
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll
@ -0,0 +1,72 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s
 ; OPT: declare void @foo4() local_unnamed_addr #0
 ; OPT: define void @foo3() local_unnamed_addr #1
 ; OPT: define void @foo2() local_unnamed_addr #1
 ; OPT: define void @foo1() local_unnamed_addr #1
 ; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2
 ; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #2
 ; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" }
 ; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64
 ; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32
 ; OPT: attributes #3 = { nounwind }
 ; LLC: foo3:
 ; LLC: sample asm
 ; LLC: foo2:
 ; LLC: sample asm
 ; LLC: foo1:
 ; LLC: foo4@gotpcrel32@lo+4
 ; LLC: foo4@gotpcrel32@hi+4
 ; LLC: foo3@gotpcrel32@lo+4
 ; LLC: foo3@gotpcrel32@hi+4
 ; LLC: foo2@gotpcrel32@lo+4
 ; LLC: foo2@gotpcrel32@hi+4
 ; LLC: foo1@gotpcrel32@lo+4
 ; LLC: foo1@gotpcrel32@hi+4
 ; LLC: kernel1:
 ; LLC: foo1@gotpcrel32@lo+4
 ; LLC: foo1@gotpcrel32@hi+4
 ; LLC: kernel2:
 ; LLC: foo2@gotpcrel32@lo+4
 ; LLC: foo2@gotpcrel32@hi+4
 ; Test input: both kernels use attribute group #0 (+wavefrontsize32) while
 ; every non-kernel function carries group #1 (+wavefrontsize64), so all
 ; callers agree with each other but disagree with their callees.

 ; Declaration only; it has no body.
 declare void @foo4() #1
 ; Leaf function, called from @foo1.
 define void @foo3() #1 {
 entry:
  call void asm sideeffect "; sample asm", ""()
  ret void
 }
 ; Leaf function, called from @foo1 (twice) and from @kernel2.
 define void @foo2() #1 {
 entry:
  call void asm sideeffect "; sample asm", ""()
  ret void
 }
 ; Called from @kernel1; also calls itself.
 define void @foo1() #1 {
 entry:
  tail call void @foo4()
  tail call void @foo3()
  tail call void @foo2()
  tail call void @foo2()
  tail call void @foo1()
  ret void
 }
 ; Kernel with attribute group #0 (+wavefrontsize32).
 define amdgpu_kernel void @kernel1() #0 {
 entry:
  tail call void @foo1()
  ret void
 }
 ; Kernel with attribute group #0 (+wavefrontsize32).
 define amdgpu_kernel void @kernel2() #0 {
 entry:
  tail call void @foo2()
  ret void
 }
 ; #0: wave32 kernels, #1: noinline wave64 non-kernel functions.
 attributes #0 = { nounwind "target-features"="+wavefrontsize32" }
 attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" }