[AMDGPU] gfx1010 wavefrontsize intrinsic folding

Differential Revision: https://reviews.llvm.org/D63206

llvm-svn: 363588
This commit is contained in:
Stanislav Mekhanoshin 2019-06-17 17:57:50 +00:00
parent 6d741f29ec
commit a9191c8492
4 changed files with 143 additions and 16 deletions

View File

@ -53,7 +53,8 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();

View File

@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/ADT/StringSet.h"
@ -22,6 +23,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@ -29,6 +31,7 @@
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <vector>
#include <cmath>
@ -65,6 +68,8 @@ private:
typedef llvm::AMDGPULibFunc FuncInfo;
const TargetMachine *TM;
// -fuse-native.
bool AllNative = false;
@ -134,6 +139,9 @@ private:
// __read_pipe/__write_pipe
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
// llvm.amdgcn.wavefrontsize
bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
// Get insertion point at entry.
BasicBlock::iterator getEntryIns(CallInst * UI);
// Insert an Alloc instruction.
@ -152,6 +160,8 @@ protected:
}
public:
AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
void initNativeFuncs();
@ -166,15 +176,16 @@ namespace {
class AMDGPUSimplifyLibCalls : public FunctionPass {
AMDGPULibCalls Simplifier;
const TargetOptions Options;
AMDGPULibCalls Simplifier;
public:
static char ID; // Pass identification
AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
: FunctionPass(ID), Options(Opt) {
AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
const TargetMachine *TM = nullptr)
: FunctionPass(ID), Options(Opt), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
@ -639,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
// Ignore indirect calls.
if (Callee == 0) return false;
FuncInfo FInfo;
if (!parseFunctionName(Callee->getName(), &FInfo))
return false;
// Further check the number of arguments to see if they match.
if (CI->getNumArgOperands() != FInfo.getNumArgs())
return false;
BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();
IRBuilder<> B(Context);
@ -658,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
B.setFastMathFlags(FPOp->getFastMathFlags());
switch (Callee->getIntrinsicID()) {
default:
break;
case Intrinsic::amdgcn_wavefrontsize:
return !EnablePreLink && fold_wavefrontsize(CI, B);
}
FuncInfo FInfo;
if (!parseFunctionName(Callee->getName(), &FInfo))
return false;
// Further check the number of arguments to see if they match.
if (CI->getNumArgOperands() != FInfo.getNumArgs())
return false;
if (TDOFold(CI, FInfo))
return true;
@ -1371,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
return true;
}
// Fold a call to llvm.amdgcn.wavefrontsize into an i32 constant.
//
// Nothing is folded when no TargetMachine was supplied, or when the CPU name
// is empty or "generic" while the feature string is empty or never mentions
// "wavefrontsize" (both comparisons are case-insensitive). Otherwise every use
// of CI is replaced with the wavefront size reported by the function's
// GCNSubtarget and CI itself is erased.
//
// Returns true iff the call was replaced and removed.
bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
  if (!TM)
    return false;

  StringRef CPU = TM->getTargetCPU();
  StringRef Features = TM->getTargetFeatureString();

  const bool IsGenericCPU = CPU.empty() || CPU.equals_lower("generic");
  const bool MentionsWaveSize =
      !Features.empty() &&
      Features.find_lower("wavefrontsize") != StringRef::npos;
  if (IsGenericCPU && !MentionsWaveSize)
    return false;

  Function *Parent = CI->getParent()->getParent();
  const GCNSubtarget &Subtarget = TM->getSubtarget<GCNSubtarget>(*Parent);
  unsigned WaveSize = Subtarget.getWavefrontSize();

  // Print the call before it is erased below.
  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
                    << WaveSize << "\n");

  Constant *Folded = ConstantInt::get(B.getInt32Ty(), WaveSize);
  CI->replaceAllUsesWith(Folded);
  CI->eraseFromParent();
  return true;
}
// Get insertion point at entry.
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
Function * Func = UI->getParent()->getParent();
@ -1680,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
}
// Public interface to the Simplify LibCalls pass.
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
return new AMDGPUSimplifyLibCalls(Opt);
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
const TargetMachine *TM) {
return new AMDGPUSimplifyLibCalls(Opt, TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {

View File

@ -432,7 +432,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
});
Builder.addExtension(

View File

@ -0,0 +1,84 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
; GCN-LABEL: {{^}}fold_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_dword v[{{[0-9:]+}}], [[V]]
; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4
; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4
; OPT-NEXT: ret void
; Kernel that stores the raw result of llvm.amdgcn.wavefrontsize through the
; addrspace(1) pointer %arg. The check lines above expect the stored value to
; become the constant 32 or 64 in the runs that use the wave32/wave64 prefixes,
; and expect the intrinsic call to remain in the runs using the WXX prefix.
define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, i32 addrspace(1)* %arg, align 4
ret void
}
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_dword v[{{[0-9:]+}}], [[V]]
; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4
; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1
; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg
; OPT-NEXT: ret void
; Kernel that derives a value from the wavefront size instead of storing it
; directly: it stores 2 when llvm.amdgcn.wavefrontsize is (unsigned) greater
; than 32 and 1 otherwise. The check lines above expect the compare/select to
; collapse to a constant store of 1 (wave32 runs) or 2 (wave64 runs), with no
; cndmask in the generated code, and to stay intact in the WXX runs.
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
; Unsigned compare: true only when the wavefront size exceeds 32.
%tmp1 = icmp ugt i32 %tmp, 32
%tmp2 = select i1 %tmp1, i32 2, i32 1
store i32 %tmp2, i32 addrspace(1)* %arg
ret void
}
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT: bb:
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
; OPT-WXX: bb3:
; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4
; OPT-NEXT: ret void
; Kernel whose control flow depends on the wavefront size: the store of 1 to
; %arg in bb2 is reached only when llvm.amdgcn.wavefrontsize is (unsigned)
; greater than 32. The check lines above expect, for the wave64 opt runs, a
; store of 1 immediately followed by the return, and for the WXX runs the
; intrinsic call and compare to still be present.
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
; Unsigned compare: true only when the wavefront size exceeds 32.
%tmp1 = icmp ugt i32 %tmp, 32
br i1 %tmp1, label %bb2, label %bb3
bb2: ; preds = %bb
; Taken only on the "greater than 32" path.
store i32 1, i32 addrspace(1)* %arg, align 4
br label %bb3
bb3: ; preds = %bb2, %bb
ret void
}
declare i32 @llvm.amdgcn.wavefrontsize() #0
attributes #0 = { nounwind readnone speculatable }