forked from OSchip/llvm-project
[AMDGPU] gfx1010 wavefrontsize intrinsic folding
Differential Revision: https://reviews.llvm.org/D63206 llvm-svn: 363588
This commit is contained in:
parent
6d741f29ec
commit
a9191c8492
|
@ -53,7 +53,8 @@ FunctionPass *createSIMemoryLegalizerPass();
|
|||
FunctionPass *createSIInsertWaitcntsPass();
|
||||
FunctionPass *createSIPreAllocateWWMRegsPass();
|
||||
FunctionPass *createSIFormMemoryClausesPass();
|
||||
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
|
||||
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
|
||||
const TargetMachine *);
|
||||
FunctionPass *createAMDGPUUseNativeCallsPass();
|
||||
FunctionPass *createAMDGPUCodeGenPreparePass();
|
||||
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPULibFunc.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/Loads.h"
|
||||
#include "llvm/ADT/StringSet.h"
|
||||
|
@ -22,6 +23,7 @@
|
|||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
|
@ -29,6 +31,7 @@
|
|||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
|
@ -65,6 +68,8 @@ private:
|
|||
|
||||
typedef llvm::AMDGPULibFunc FuncInfo;
|
||||
|
||||
const TargetMachine *TM;
|
||||
|
||||
// -fuse-native.
|
||||
bool AllNative = false;
|
||||
|
||||
|
@ -134,6 +139,9 @@ private:
|
|||
// __read_pipe/__write_pipe
|
||||
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
|
||||
|
||||
// llvm.amdgcn.wavefrontsize
|
||||
bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
|
||||
|
||||
// Get insertion point at entry.
|
||||
BasicBlock::iterator getEntryIns(CallInst * UI);
|
||||
// Insert an Alloc instruction.
|
||||
|
@ -152,6 +160,8 @@ protected:
|
|||
}
|
||||
|
||||
public:
|
||||
AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
|
||||
|
||||
bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
|
||||
|
||||
void initNativeFuncs();
|
||||
|
@ -166,15 +176,16 @@ namespace {
|
|||
|
||||
class AMDGPUSimplifyLibCalls : public FunctionPass {
|
||||
|
||||
AMDGPULibCalls Simplifier;
|
||||
|
||||
const TargetOptions Options;
|
||||
|
||||
AMDGPULibCalls Simplifier;
|
||||
|
||||
public:
|
||||
static char ID; // Pass identification
|
||||
|
||||
AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
|
||||
: FunctionPass(ID), Options(Opt) {
|
||||
AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
|
||||
const TargetMachine *TM = nullptr)
|
||||
: FunctionPass(ID), Options(Opt), Simplifier(TM) {
|
||||
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
|
@ -639,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
|
|||
// Ignore indirect calls.
|
||||
if (Callee == 0) return false;
|
||||
|
||||
FuncInfo FInfo;
|
||||
if (!parseFunctionName(Callee->getName(), &FInfo))
|
||||
return false;
|
||||
|
||||
// Further check the number of arguments to see if they match.
|
||||
if (CI->getNumArgOperands() != FInfo.getNumArgs())
|
||||
return false;
|
||||
|
||||
BasicBlock *BB = CI->getParent();
|
||||
LLVMContext &Context = CI->getParent()->getContext();
|
||||
IRBuilder<> B(Context);
|
||||
|
@ -658,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
|
|||
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
|
||||
B.setFastMathFlags(FPOp->getFastMathFlags());
|
||||
|
||||
switch (Callee->getIntrinsicID()) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::amdgcn_wavefrontsize:
|
||||
return !EnablePreLink && fold_wavefrontsize(CI, B);
|
||||
}
|
||||
|
||||
FuncInfo FInfo;
|
||||
if (!parseFunctionName(Callee->getName(), &FInfo))
|
||||
return false;
|
||||
|
||||
// Further check the number of arguments to see if they match.
|
||||
if (CI->getNumArgOperands() != FInfo.getNumArgs())
|
||||
return false;
|
||||
|
||||
if (TDOFold(CI, FInfo))
|
||||
return true;
|
||||
|
||||
|
@ -1371,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Replace a call to llvm.amdgcn.wavefrontsize with the subtarget's wavefront
// size as an i32 constant. Returns true when the call was replaced and erased.
//
// Nothing is folded when no TargetMachine was supplied, or when the target
// CPU is empty/"generic" and the feature string does not mention
// "wavefrontsize" (compared case-insensitively).
bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
  if (TM == nullptr)
    return false;

  const StringRef CPU = TM->getTargetCPU();
  const StringRef Features = TM->getTargetFeatureString();

  // Fold only if either the CPU or the feature string pins the wave size.
  const bool HasSpecificCPU = !CPU.empty() && !CPU.equals_lower("generic");
  const bool MentionsWaveSize =
      !Features.empty() &&
      Features.find_lower("wavefrontsize") != StringRef::npos;
  if (!HasSpecificCPU && !MentionsWaveSize)
    return false;

  // The subtarget is looked up for the function that contains the call.
  Function *ParentFn = CI->getParent()->getParent();
  const unsigned WaveSize =
      TM->getSubtarget<GCNSubtarget>(*ParentFn).getWavefrontSize();

  // Print before the call is erased below, since the message dereferences CI.
  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
                    << WaveSize << "\n");

  Value *Folded = ConstantInt::get(B.getInt32Ty(), WaveSize);
  CI->replaceAllUsesWith(Folded);
  CI->eraseFromParent();
  return true;
}
|
||||
|
||||
// Get insertion point at entry.
|
||||
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
|
||||
Function * Func = UI->getParent()->getParent();
|
||||
|
@ -1680,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
|
|||
}
|
||||
|
||||
// Public interface to the Simplify LibCalls pass.
|
||||
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
|
||||
return new AMDGPUSimplifyLibCalls(Opt);
|
||||
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
|
||||
const TargetMachine *TM) {
|
||||
return new AMDGPUSimplifyLibCalls(Opt, TM);
|
||||
}
|
||||
|
||||
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
|
||||
|
|
|
@ -432,7 +432,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
|
||||
PM.add(llvm::createAMDGPUUseNativeCallsPass());
|
||||
if (LibCallSimplify)
|
||||
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
|
||||
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
|
||||
});
|
||||
|
||||
Builder.addExtension(
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
|
||||
|
||||
; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
|
||||
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
|
||||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
|
||||
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
|
||||
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
|
||||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
|
||||
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
|
||||
|
||||
; GCN-LABEL: {{^}}fold_wavefrontsize:
|
||||
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
|
||||
|
||||
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
|
||||
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[V]]
|
||||
|
||||
; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
|
||||
; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-NEXT: ret void
|
||||
|
||||
; Stores the value returned by llvm.amdgcn.wavefrontsize directly to %arg, so
; the stored operand shows whether the call was replaced by a constant.
define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
|
||||
store i32 %tmp, i32 addrspace(1)* %arg, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
|
||||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
|
||||
|
||||
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
|
||||
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
|
||||
; GCN-NOT: cndmask
|
||||
; GCN: store_dword v[{{[0-9:]+}}], [[V]]
|
||||
|
||||
; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
|
||||
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
|
||||
; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1
|
||||
; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg
|
||||
; OPT-NEXT: ret void
|
||||
|
||||
; Compares the wavefront size against 32 and stores 2 when it is greater,
; 1 otherwise. The checks above expect this compare/select to collapse to a
; constant store once the intrinsic call has been folded.
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
|
||||
%tmp1 = icmp ugt i32 %tmp, 32
|
||||
%tmp2 = select i1 %tmp1, i32 2, i32 1
|
||||
store i32 %tmp2, i32 addrspace(1)* %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
|
||||
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
|
||||
|
||||
; OPT: bb:
|
||||
; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
|
||||
; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32
|
||||
; OPT-WXX: bb3:
|
||||
; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4
|
||||
; OPT-NEXT: ret void
|
||||
|
||||
; Branches on "wavefront size > 32": bb2 stores 1 to %arg only when the
; comparison is true, and bb3 returns without storing anything itself.
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
|
||||
%tmp1 = icmp ugt i32 %tmp, 32
|
||||
br i1 %tmp1, label %bb2, label %bb3
|
||||
|
||||
bb2: ; preds = %bb
|
||||
store i32 1, i32 addrspace(1)* %arg, align 4
|
||||
br label %bb3
|
||||
|
||||
bb3: ; preds = %bb2, %bb
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.wavefrontsize() #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
Loading…
Reference in New Issue