2016-06-24 15:07:55 +08:00
|
|
|
//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// This pass does misc. AMDGPU optimizations on IR before instruction
|
|
|
|
/// selection.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
2016-07-20 07:16:53 +08:00
|
|
|
#include "AMDGPUTargetMachine.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
2016-06-24 15:07:55 +08:00
|
|
|
#include "llvm/Analysis/DivergenceAnalysis.h"
|
2017-07-27 05:07:28 +08:00
|
|
|
#include "llvm/Analysis/Loads.h"
|
2016-06-24 15:07:55 +08:00
|
|
|
#include "llvm/CodeGen/Passes.h"
|
2017-05-19 01:21:13 +08:00
|
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/IR/Attributes.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
|
|
#include "llvm/IR/Function.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/IR/IRBuilder.h"
|
|
|
|
#include "llvm/IR/InstVisitor.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/IR/InstrTypes.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
|
|
#include "llvm/IR/Intrinsics.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
|
|
|
#include "llvm/IR/Operator.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
|
|
|
#include "llvm/Pass.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
|
|
|
#include <cassert>
|
|
|
|
#include <iterator>
|
2016-06-24 15:07:55 +08:00
|
|
|
|
|
|
|
#define DEBUG_TYPE "amdgpu-codegenprepare"
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
class AMDGPUCodeGenPrepare : public FunctionPass,
|
2016-07-20 07:16:53 +08:00
|
|
|
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
|
2017-01-21 01:52:16 +08:00
|
|
|
const SISubtarget *ST = nullptr;
|
|
|
|
DivergenceAnalysis *DA = nullptr;
|
|
|
|
Module *Mod = nullptr;
|
|
|
|
bool HasUnsafeFPMath = false;
|
2017-07-27 05:07:28 +08:00
|
|
|
AMDGPUAS AMDGPUASI;
|
2016-06-24 15:07:55 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
|
|
|
|
/// binary operation \p V.
|
2016-09-29 04:05:39 +08:00
|
|
|
///
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \returns Binary operation \p V.
|
|
|
|
/// \returns \p T's base element bit width.
|
|
|
|
unsigned getBaseElementBitWidth(const Type *T) const;
|
2016-09-29 04:05:39 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \returns Equivalent 32 bit integer type for given type \p T. For example,
|
|
|
|
/// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
|
|
|
|
/// is returned.
|
2016-09-29 04:05:39 +08:00
|
|
|
Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
|
|
|
|
|
|
|
|
/// \returns True if binary operation \p I is a signed binary operation, false
|
|
|
|
/// otherwise.
|
|
|
|
bool isSigned(const BinaryOperator &I) const;
|
|
|
|
|
|
|
|
/// \returns True if the condition of 'select' operation \p I comes from a
|
|
|
|
/// signed 'icmp' operation, false otherwise.
|
|
|
|
bool isSigned(const SelectInst &I) const;
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \returns True if type \p T needs to be promoted to 32 bit integer type,
|
|
|
|
/// false otherwise.
|
|
|
|
bool needsPromotionToI32(const Type *T) const;
|
|
|
|
|
|
|
|
/// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
|
|
|
|
/// operation.
|
|
|
|
///
|
|
|
|
/// \details \p I's base element bit width must be greater than 1 and less
|
|
|
|
/// than or equal 16. Promotion is done by sign or zero extending operands to
|
|
|
|
/// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
|
|
|
|
/// truncating the result of 32 bit binary operation back to \p I's original
|
|
|
|
/// type. Division operation is not promoted.
|
2016-09-29 04:05:39 +08:00
|
|
|
///
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \returns True if \p I is promoted to equivalent 32 bit binary operation,
|
|
|
|
/// false otherwise.
|
|
|
|
bool promoteUniformOpToI32(BinaryOperator &I) const;
|
2016-09-29 04:05:39 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
|
|
|
|
///
|
|
|
|
/// \details \p I's base element bit width must be greater than 1 and less
|
|
|
|
/// than or equal 16. Promotion is done by sign or zero extending operands to
|
|
|
|
/// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
|
2016-09-29 04:05:39 +08:00
|
|
|
///
|
|
|
|
/// \returns True.
|
2016-10-07 22:22:58 +08:00
|
|
|
bool promoteUniformOpToI32(ICmpInst &I) const;
|
2016-09-29 04:05:39 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
|
|
|
|
/// operation.
|
|
|
|
///
|
|
|
|
/// \details \p I's base element bit width must be greater than 1 and less
|
|
|
|
/// than or equal 16. Promotion is done by sign or zero extending operands to
|
|
|
|
/// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
|
|
|
|
/// result of 32 bit 'select' operation back to \p I's original type.
|
2016-09-29 04:05:39 +08:00
|
|
|
///
|
|
|
|
/// \returns True.
|
2016-10-07 22:22:58 +08:00
|
|
|
bool promoteUniformOpToI32(SelectInst &I) const;
|
2016-10-06 10:20:46 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
|
|
|
|
/// intrinsic.
|
|
|
|
///
|
|
|
|
/// \details \p I's base element bit width must be greater than 1 and less
|
|
|
|
/// than or equal 16. Promotion is done by zero extending the operand to 32
|
|
|
|
/// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
|
|
|
|
/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
|
|
|
|
/// shift amount is 32 minus \p I's base element bit width), and truncating
|
|
|
|
/// the result of the shift operation back to \p I's original type.
|
2016-10-06 10:20:46 +08:00
|
|
|
///
|
|
|
|
/// \returns True.
|
2016-10-07 22:22:58 +08:00
|
|
|
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
|
2017-07-27 05:07:28 +08:00
|
|
|
/// \brief Widen a scalar load.
|
|
|
|
///
|
|
|
|
/// \details \p Widen scalar load for uniform, small type loads from constant
|
|
|
|
// memory / to a full 32-bits and then truncate the input to allow a scalar
|
|
|
|
// load instead of a vector load.
|
|
|
|
//
|
|
|
|
/// \returns True.
|
|
|
|
|
|
|
|
bool canWidenScalarExtLoad(LoadInst &I) const;
|
2016-09-29 04:05:39 +08:00
|
|
|
|
2016-06-24 15:07:55 +08:00
|
|
|
public:
|
|
|
|
static char ID;
|
2017-01-21 01:52:16 +08:00
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
|
2016-07-20 07:16:53 +08:00
|
|
|
|
|
|
|
bool visitFDiv(BinaryOperator &I);
|
|
|
|
|
2016-09-29 04:05:39 +08:00
|
|
|
bool visitInstruction(Instruction &I) { return false; }
|
|
|
|
bool visitBinaryOperator(BinaryOperator &I);
|
2017-07-27 05:07:28 +08:00
|
|
|
bool visitLoadInst(LoadInst &I);
|
2016-09-29 04:05:39 +08:00
|
|
|
bool visitICmpInst(ICmpInst &I);
|
|
|
|
bool visitSelectInst(SelectInst &I);
|
2016-06-24 15:07:55 +08:00
|
|
|
|
2016-10-06 10:20:46 +08:00
|
|
|
bool visitIntrinsicInst(IntrinsicInst &I);
|
|
|
|
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
|
|
|
|
|
2016-06-24 15:07:55 +08:00
|
|
|
bool doInitialization(Module &M) override;
|
|
|
|
bool runOnFunction(Function &F) override;
|
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
|
2016-06-24 15:07:55 +08:00
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.addRequired<DivergenceAnalysis>();
|
|
|
|
AU.setPreservesAll();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2017-01-21 01:52:16 +08:00
|
|
|
} // end anonymous namespace
|
2016-06-24 15:07:55 +08:00
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
|
|
|
|
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
|
2016-09-29 04:05:39 +08:00
|
|
|
|
|
|
|
if (T->isIntegerTy())
|
2016-10-07 22:22:58 +08:00
|
|
|
return T->getIntegerBitWidth();
|
|
|
|
return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
|
2016-09-29 04:05:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Builds the 32 bit counterpart of \p T: i32 for an integer type, or a
/// vector of i32 with the same element count for a vector type. \p T must be
/// a type for which needsPromotionToI32() holds.
Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  Type *Int32 = B.getInt32Ty();
  if (!T->isIntegerTy())
    return VectorType::get(Int32, cast<VectorType>(T)->getNumElements());

  return Int32;
}
|
|
|
|
|
|
|
|
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
|
2016-10-04 02:29:01 +08:00
|
|
|
return I.getOpcode() == Instruction::AShr ||
|
|
|
|
I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
|
2016-09-29 04:05:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
|
|
|
|
return isa<ICmpInst>(I.getOperand(0)) ?
|
|
|
|
cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
|
|
|
|
}
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
|
2017-02-28 06:15:25 +08:00
|
|
|
const IntegerType *IntTy = dyn_cast<IntegerType>(T);
|
|
|
|
if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
|
2016-10-07 22:22:58 +08:00
|
|
|
return true;
|
2017-02-28 06:15:25 +08:00
|
|
|
|
|
|
|
if (const VectorType *VT = dyn_cast<VectorType>(T)) {
|
|
|
|
// TODO: The set of packed operations is more limited, so may want to
|
|
|
|
// promote some anyway.
|
|
|
|
if (ST->hasVOP3PInsts())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return needsPromotionToI32(VT->getElementType());
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
2016-10-07 22:22:58 +08:00
|
|
|
}
|
|
|
|
|
2017-02-02 00:25:23 +08:00
|
|
|
// Return true if the op promoted to i32 should have nsw set.
|
|
|
|
static bool promotedOpIsNSW(const Instruction &I) {
|
|
|
|
switch (I.getOpcode()) {
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
return true;
|
|
|
|
case Instruction::Mul:
|
|
|
|
return I.hasNoUnsignedWrap();
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return true if the op promoted to i32 should have nuw set.
|
|
|
|
static bool promotedOpIsNUW(const Instruction &I) {
|
|
|
|
switch (I.getOpcode()) {
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Mul:
|
|
|
|
return true;
|
|
|
|
case Instruction::Sub:
|
|
|
|
return I.hasNoUnsignedWrap();
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-27 05:07:28 +08:00
|
|
|
/// Checks whether load \p I is a candidate for widening to a 32 bit load.
///
/// \returns True if \p I is a simple load of a type smaller than 32 bits, its
/// alignment (explicit, or the ABI alignment of the loaded type when none is
/// specified) is at least 4, and divergence analysis reports it as uniform.
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();

  // Keep the size at full width: narrowing it to 'int' could wrap the size of
  // a very large type to a value below 32 and wrongly accept the load.
  const uint64_t TySize = DL.getTypeSizeInBits(Ty);

  // An alignment of 0 means "not specified"; fall back to the ABI alignment.
  const unsigned ExplicitAlign = I.getAlignment();
  const unsigned Align =
      ExplicitAlign ? ExplicitAlign : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// Replaces binary operation \p I with the same operation performed on
/// operands extended to 32 bits, followed by a truncation back to \p I's type.
/// Signed and unsigned division are left alone.
///
/// \returns True if \p I was replaced, false if it is a division.
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  const Instruction::BinaryOps Opc = I.getOpcode();
  if (Opc == Instruction::SDiv || Opc == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *WideTy = getI32Ty(Builder, I.getType());

  // Signed operations need sign extension to keep their meaning; everything
  // else is zero extended.
  const bool SignExtend = isSigned(I);
  Value *WideLHS = SignExtend ? Builder.CreateSExt(I.getOperand(0), WideTy)
                              : Builder.CreateZExt(I.getOperand(0), WideTy);
  Value *WideRHS = SignExtend ? Builder.CreateSExt(I.getOperand(1), WideTy)
                              : Builder.CreateZExt(I.getOperand(1), WideTy);

  Value *WideRes = Builder.CreateBinOp(Opc, WideLHS, WideRHS);

  // The builder may hand back a non-instruction value; flags can only be set
  // when a real instruction was created.
  if (auto *WideInst = dyn_cast<Instruction>(WideRes)) {
    if (promotedOpIsNSW(I))
      WideInst->setHasNoSignedWrap();

    if (promotedOpIsNUW(I))
      WideInst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      WideInst->setIsExact(ExactOp->isExact());
  }

  Value *Narrowed = Builder.CreateTrunc(WideRes, I.getType());

  I.replaceAllUsesWith(Narrowed);
  I.eraseFromParent();

  return true;
}
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// Replaces 'icmp' \p I with an 'icmp' of the same predicate whose operands
/// are extended to 32 bits (sign extended for signed predicates, zero
/// extended otherwise).
///
/// \returns True.
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *WideTy = getI32Ty(Builder, I.getOperand(0)->getType());

  const bool SignExtend = I.isSigned();
  Value *WideLHS = SignExtend ? Builder.CreateSExt(I.getOperand(0), WideTy)
                              : Builder.CreateZExt(I.getOperand(0), WideTy);
  Value *WideRHS = SignExtend ? Builder.CreateSExt(I.getOperand(1), WideTy)
                              : Builder.CreateZExt(I.getOperand(1), WideTy);

  // The comparison result type does not change, so no truncation is needed.
  Value *WideCmp = Builder.CreateICmp(I.getPredicate(), WideLHS, WideRHS);

  I.replaceAllUsesWith(WideCmp);
  I.eraseFromParent();

  return true;
}
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
/// Replaces 'select' \p I with a 'select' on the same condition whose value
/// operands are extended to 32 bits, followed by a truncation back to \p I's
/// type. Sign extension is used when the condition is a signed 'icmp', zero
/// extension otherwise.
///
/// \returns True.
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *WideTy = getI32Ty(Builder, I.getType());

  const bool SignExtend = isSigned(I);
  Value *WideTrueVal = SignExtend
                           ? Builder.CreateSExt(I.getOperand(1), WideTy)
                           : Builder.CreateZExt(I.getOperand(1), WideTy);
  Value *WideFalseVal = SignExtend
                            ? Builder.CreateSExt(I.getOperand(2), WideTy)
                            : Builder.CreateZExt(I.getOperand(2), WideTy);

  // Operand 0 is the condition and is reused unchanged.
  Value *WideSel =
      Builder.CreateSelect(I.getOperand(0), WideTrueVal, WideFalseVal);
  Value *Narrowed = Builder.CreateTrunc(WideSel, I.getType());

  I.replaceAllUsesWith(Narrowed);
  I.eraseFromParent();

  return true;
}
|
|
|
|
|
2016-10-07 22:22:58 +08:00
|
|
|
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
|
2016-10-06 10:20:46 +08:00
|
|
|
IntrinsicInst &I) const {
|
2016-10-07 22:22:58 +08:00
|
|
|
assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
|
|
|
|
"I must be bitreverse intrinsic");
|
|
|
|
assert(needsPromotionToI32(I.getType()) &&
|
|
|
|
"I does not need promotion to i32");
|
2016-10-06 10:20:46 +08:00
|
|
|
|
|
|
|
IRBuilder<> Builder(&I);
|
|
|
|
Builder.SetCurrentDebugLocation(I.getDebugLoc());
|
|
|
|
|
|
|
|
Type *I32Ty = getI32Ty(Builder, I.getType());
|
|
|
|
Function *I32 =
|
2016-10-07 22:39:53 +08:00
|
|
|
Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
|
2016-10-06 10:20:46 +08:00
|
|
|
Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
|
|
|
|
Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
|
2016-10-07 22:22:58 +08:00
|
|
|
Value *LShrOp =
|
|
|
|
Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
|
2016-10-06 10:20:46 +08:00
|
|
|
Value *TruncRes =
|
2016-10-07 22:22:58 +08:00
|
|
|
Builder.CreateTrunc(LShrOp, I.getType());
|
2016-10-06 10:20:46 +08:00
|
|
|
|
|
|
|
I.replaceAllUsesWith(TruncRes);
|
|
|
|
I.eraseFromParent();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-07-20 07:16:53 +08:00
|
|
|
// Decide whether an f32 fdiv with numerator \p Num should stay a plain fdiv.
// Only constant numerators qualify: the fdiv is kept when \p UnsafeDiv is set
// or the numerator is exactly +1.0.
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  if (const auto *CNum = dyn_cast<ConstantFP>(Num)) {
    // Reciprocal f32 is handled separately without denormals.
    return UnsafeDiv || CNum->isExactlyValue(+1.0);
  }

  return false;
}
|
|
|
|
|
|
|
|
// Insert an intrinsic for fast fdiv for safe math situations where we can
|
|
|
|
// reduce precision. Leave fdiv for situations where the generic node is
|
|
|
|
// expected to be optimized.
|
|
|
|
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
|
|
|
|
Type *Ty = FDiv.getType();
|
|
|
|
|
|
|
|
if (!Ty->getScalarType()->isFloatTy())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
|
|
|
|
if (!FPMath)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
|
|
|
|
float ULP = FPOp->getFPAccuracy();
|
|
|
|
if (ULP < 2.5f)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
FastMathFlags FMF = FPOp->getFastMathFlags();
|
[IR] redefine 'UnsafeAlgebra' / 'reassoc' fast-math-flags and add 'trans' fast-math-flag
As discussed on llvm-dev:
http://lists.llvm.org/pipermail/llvm-dev/2016-November/107104.html
and again more recently:
http://lists.llvm.org/pipermail/llvm-dev/2017-October/118118.html
...this is a step in cleaning up our fast-math-flags implementation in IR to better match
the capabilities of both clang's user-visible flags and the backend's flags for SDNode.
As proposed in the above threads, we're replacing the 'UnsafeAlgebra' bit (which had the
'umbrella' meaning that all flags are set) with a new bit that only applies to algebraic
reassociation - 'AllowReassoc'.
We're also adding a bit to allow approximations for library functions called 'ApproxFunc'
(this was initially proposed as 'libm' or similar).
...and we're out of bits. 7 bits ought to be enough for anyone, right? :) FWIW, I did
look at getting this out of SubclassOptionalData via SubclassData (spacious 16-bits),
but that's apparently already used for other purposes. Also, I don't think we can just
add a field to FPMathOperator because Operator is not intended to be instantiated.
We'll defer movement of FMF to another day.
We keep the 'fast' keyword. I thought about removing that, but seeing IR like this:
%f.fast = fadd reassoc nnan ninf nsz arcp contract afn float %op1, %op2
...made me think we want to keep the shortcut synonym.
Finally, this change is binary incompatible with existing IR as seen in the
compatibility tests. This statement:
"Newer releases can ignore features from older releases, but they cannot miscompile
them. For example, if nsw is ever replaced with something else, dropping it would be
a valid way to upgrade the IR."
( http://llvm.org/docs/DeveloperPolicy.html#ir-backwards-compatibility )
...provides the flexibility we want to make this change without requiring a new IR
version. Ie, we're not loosening the FP strictness of existing IR. At worst, we will
fail to optimize some previously 'fast' code because it's no longer recognized as
'fast'. This should get fixed as we audit/squash all of the uses of 'isFast()'.
Note: an inter-dependent clang commit to use the new API name should closely follow
commit.
Differential Revision: https://reviews.llvm.org/D39304
llvm-svn: 317488
2017-11-07 00:27:15 +08:00
|
|
|
bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
|
2016-07-20 07:16:53 +08:00
|
|
|
FMF.allowReciprocal();
|
2017-07-07 04:34:21 +08:00
|
|
|
|
|
|
|
// With UnsafeDiv node will be optimized to just rcp and mul.
|
|
|
|
if (ST->hasFP32Denormals() || UnsafeDiv)
|
2016-07-20 07:16:53 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
|
|
|
|
Builder.setFastMathFlags(FMF);
|
|
|
|
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
|
|
|
|
|
2017-03-18 04:41:45 +08:00
|
|
|
Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
|
2016-07-20 07:16:53 +08:00
|
|
|
|
|
|
|
Value *Num = FDiv.getOperand(0);
|
|
|
|
Value *Den = FDiv.getOperand(1);
|
|
|
|
|
|
|
|
Value *NewFDiv = nullptr;
|
|
|
|
|
|
|
|
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
|
|
|
|
NewFDiv = UndefValue::get(VT);
|
|
|
|
|
|
|
|
// FIXME: Doesn't do the right thing for cases where the vector is partially
|
|
|
|
// constant. This works when the scalarizer pass is run first.
|
|
|
|
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
|
|
|
|
Value *NumEltI = Builder.CreateExtractElement(Num, I);
|
|
|
|
Value *DenEltI = Builder.CreateExtractElement(Den, I);
|
|
|
|
Value *NewElt;
|
|
|
|
|
|
|
|
if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
|
|
|
|
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
|
|
|
|
} else {
|
|
|
|
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
|
|
|
|
}
|
|
|
|
|
|
|
|
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (!shouldKeepFDivF32(Num, UnsafeDiv))
|
|
|
|
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NewFDiv) {
|
|
|
|
FDiv.replaceAllUsesWith(NewFDiv);
|
|
|
|
NewFDiv->takeName(&FDiv);
|
|
|
|
FDiv.eraseFromParent();
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool hasUnsafeFPMath(const Function &F) {
|
|
|
|
Attribute Attr = F.getFnAttribute("unsafe-fp-math");
|
|
|
|
return Attr.getValueAsString() == "true";
|
|
|
|
}
|
|
|
|
|
2016-09-29 04:05:39 +08:00
|
|
|
/// Promotes binary operation \p I to 32 bits when the subtarget has 16 bit
/// instructions, \p I's type needs promotion, and \p I is uniform.
///
/// \returns True if \p I was replaced, false otherwise.
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
      !DA->isUniform(&I))
    return false;

  return promoteUniformOpToI32(I);
}
|
|
|
|
|
2017-07-27 05:07:28 +08:00
|
|
|
/// Widens load \p I to a 32 bit load when it reads from one of the constant
/// address spaces and canWidenScalarExtLoad() accepts it. The loaded i32 is
/// truncated to an integer of the original size and bitcast back to \p I's
/// type.
///
/// \returns True if \p I was replaced, false otherwise.
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  const bool IsConstantAS =
      I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
      I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
  if (!IsConstantAS || !canWidenScalarExtLoad(I))
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  // Load a full i32 through the same pointer, reinterpreted as i32*.
  Type *WideTy = Builder.getInt32Ty();
  Type *WidePtrTy = PointerType::get(WideTy, I.getPointerAddressSpace());
  Value *WidePtr = Builder.CreateBitCast(I.getPointerOperand(), WidePtrTy);
  Value *WideLoad = Builder.CreateLoad(WidePtr);

  // Cut the value back down to the original number of bits, then restore the
  // original type.
  int OrigBits = Mod->getDataLayout().getTypeSizeInBits(I.getType());
  Type *OrigIntTy = Builder.getIntNTy(OrigBits);
  Value *Narrowed = Builder.CreateTrunc(WideLoad, OrigIntTy);
  Value *Restored = Builder.CreateBitCast(Narrowed, I.getType());

  I.replaceAllUsesWith(Restored);
  I.eraseFromParent();
  return true;
}
|
|
|
|
|
2016-09-29 04:05:39 +08:00
|
|
|
/// Promotes 'icmp' \p I to 32 bits when the subtarget has 16 bit
/// instructions, the type of \p I's first operand needs promotion, and \p I
/// is uniform.
///
/// \returns True if \p I was replaced, false otherwise.
bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  if (!ST->has16BitInsts() ||
      !needsPromotionToI32(I.getOperand(0)->getType()) || !DA->isUniform(&I))
    return false;

  return promoteUniformOpToI32(I);
}
|
|
|
|
|
|
|
|
/// Promotes 'select' \p I to 32 bits when the subtarget has 16 bit
/// instructions, \p I's type needs promotion, and \p I is uniform.
///
/// \returns True if \p I was replaced, false otherwise.
bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
      !DA->isUniform(&I))
    return false;

  return promoteUniformOpToI32(I);
}
|
|
|
|
|
|
|
|
/// Dispatches intrinsic call \p I to its dedicated handler. Only 'bitreverse'
/// is handled; every other intrinsic is left untouched.
///
/// \returns The handler's result for 'bitreverse', false otherwise.
bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  if (I.getIntrinsicID() == Intrinsic::bitreverse)
    return visitBitreverseIntrinsicInst(I);

  return false;
}
|
|
|
|
|
|
|
|
/// Promotes 'bitreverse' intrinsic call \p I to 32 bits when the subtarget
/// has 16 bit instructions, \p I's type needs promotion, and \p I is uniform.
///
/// \returns True if \p I was replaced, false otherwise.
bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  if (!ST->has16BitInsts() || !needsPromotionToI32(I.getType()) ||
      !DA->isUniform(&I))
    return false;

  return promoteUniformBitreverseToI32(I);
}
|
|
|
|
|
2016-06-24 15:07:55 +08:00
|
|
|
/// Remembers module \p M for later use by the visitors (data layout queries
/// and intrinsic declarations).
///
/// \returns False; the module itself is not modified here.
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;

  return false;
}
|
|
|
|
|
|
|
|
/// Runs the visitors over every instruction of \p F.
///
/// Nothing is done when the function is skipped or when no TargetPassConfig
/// is available to obtain the target machine from.
///
/// \returns True if any visitor reported a change, false otherwise.
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  // Refresh the per-function state consulted by the visitors.
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator It = BB.begin();
    const BasicBlock::iterator End = BB.end();
    while (It != End) {
      // Step past the current instruction before visiting it: a visitor may
      // erase the instruction it is given.
      Instruction &Cur = *It;
      ++It;
      MadeChange |= visit(Cur);
    }
  }

  return MadeChange;
}
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
// Pass registration. DivergenceAnalysis is listed as a dependency, matching
// the requirement declared in getAnalysisUsage().
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR optimizations", false, false)

// Definition of the static pass identifier handed to the FunctionPass
// constructor.
char AMDGPUCodeGenPrepare::ID = 0;
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
/// Factory for the pass: returns a newly allocated AMDGPUCodeGenPrepare
/// instance.
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
|