[AMDGPU] gfx1010 wavefrontsize intrinsic folding

Differential Revision: https://reviews.llvm.org/D63206
llvm-svn: 363588
2019-06-17 17:57:50 +00:00 · 2019-06-17 17:57:50 +00:00 · a9191c8492
parent 6d741f29ec
commit a9191c8492
4 changed files with 143 additions and 16 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@ -53,7 +53,8 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
+                                               const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@ -15,6 +15,7 @@

 #include "AMDGPU.h"
 #include "AMDGPULibFunc.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/ADT/StringSet.h"
@ -22,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@ -29,6 +31,7 @@
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <vector>
 #include <cmath>
@ -65,6 +68,8 @@ private:

  typedef llvm::AMDGPULibFunc FuncInfo;

+  const TargetMachine *TM;
+
  // -fuse-native.
  bool AllNative = false;

@ -134,6 +139,9 @@ private:
  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);

+  // llvm.amdgcn.wavefrontsize
+  bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
+
  // Get insertion point at entry.
  BasicBlock::iterator getEntryIns(CallInst * UI);
  // Insert an Alloc instruction.
@ -152,6 +160,8 @@ protected:
  }

 public:
+  AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+
  bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);

  void initNativeFuncs();
@ -166,15 +176,16 @@ namespace {

  class AMDGPUSimplifyLibCalls : public FunctionPass {

-  AMDGPULibCalls Simplifier;
-
  const TargetOptions Options;

+  AMDGPULibCalls Simplifier;
+
  public:
    static char ID; // Pass identification

-    AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
-      : FunctionPass(ID), Options(Opt) {
+    AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
+                           const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), Options(Opt), Simplifier(TM) {
      initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
    }

@ -639,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
  // Ignore indirect calls.
  if (Callee == 0) return false;

-  FuncInfo FInfo;
-  if (!parseFunctionName(Callee->getName(), &FInfo))
-    return false;
-
-  // Further check the number of arguments to see if they match.
-  if (CI->getNumArgOperands() != FInfo.getNumArgs())
-    return false;
-
  BasicBlock *BB = CI->getParent();
  LLVMContext &Context = CI->getParent()->getContext();
  IRBuilder<> B(Context);
@ -658,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
    B.setFastMathFlags(FPOp->getFastMathFlags());

+  switch (Callee->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::amdgcn_wavefrontsize:
+    return !EnablePreLink && fold_wavefrontsize(CI, B);
+  }
+
+  FuncInfo FInfo;
+  if (!parseFunctionName(Callee->getName(), &FInfo))
+    return false;
+
+  // Further check the number of arguments to see if they match.
+  if (CI->getNumArgOperands() != FInfo.getNumArgs())
+    return false;
+
  if (TDOFold(CI, FInfo))
    return true;

@ -1371,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
  return true;
 }

+bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { // Replace call CI with an i32 constant wavefront size; returns true iff CI was replaced and erased.
+  if (!TM) // No TargetMachine was supplied (the constructor defaults TM to nullptr), so there is no subtarget to query.
+    return false;
+
+  StringRef CPU = TM->getTargetCPU();
+  StringRef Features = TM->getTargetFeatureString();
+  if ((CPU.empty() || CPU.equals_lower("generic")) && // CPU is unspecified or "generic" (case-insensitive) ...
+      (Features.empty() || // ... and there is no feature string at all ...
+       Features.find_lower("wavefrontsize") == StringRef::npos)) // ... or it never mentions "wavefrontsize" (case-insensitive).
+    return false; // Leave the call in place; presumably the wave size is not pinned down yet in this configuration.
+
+  Function *F = CI->getParent()->getParent(); // Function that contains the call.
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F); // Subtarget is looked up per function.
+  unsigned N = ST.getWavefrontSize();
+
+  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
+               << N << "\n");
+
+  CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N)); // Every user of the call now sees the i32 constant N.
+  CI->eraseFromParent(); // CI is deleted here; the caller must not touch it after a true return.
+  return true;
+}
+
 // Get insertion point at entry.
 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
  Function * Func = UI->getParent()->getParent();
@ -1680,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
 }

 // Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
-  return new AMDGPUSimplifyLibCalls(Opt);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
+                                                     const TargetMachine *TM) {
+  return new AMDGPUSimplifyLibCalls(Opt, TM);
 }

 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -432,7 +432,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
-        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
+        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
  });

  Builder.addExtension(
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+
+; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+
+; GCN-LABEL: {{^}}fold_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+
+; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
+; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
+; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+
+; OPT-W32:   store i32 32, i32 addrspace(1)* %arg, align 4
+; OPT-W64:   store i32 64, i32 addrspace(1)* %arg, align 4
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   store i32 %tmp, i32 addrspace(1)* %arg, align 4
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+
+; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
+; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
+; GCN-NOT:   cndmask
+; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+
+; OPT-W32:   store i32 1, i32 addrspace(1)* %arg, align 4
+; OPT-W64:   store i32 2, i32 addrspace(1)* %arg, align 4
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT-WXX:   %tmp2 = select i1 %tmp1, i32 2, i32 1
+; OPT-WXX:   store i32 %tmp2, i32 addrspace(1)* %arg
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+
+; OPT:       bb:
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT-WXX:   bb3:
+; OPT-W64:   store i32 1, i32 addrspace(1)* %arg, align 4
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, i32 addrspace(1)* %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }