forked from OSchip/llvm-project
1005 lines
34 KiB
C++
1005 lines
34 KiB
C++
//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// \file
|
|
// This file implements a TargetTransformInfo analysis pass specific to the
|
|
// AMDGPU target machine. It uses the target's detailed information to provide
|
|
// more precise answers to certain TTI queries, while letting the target
|
|
// independent and default TTI implementations handle the rest.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPUTargetTransformInfo.h"
|
|
#include "llvm/Support/KnownBits.h"
|
|
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "AMDGPUtti"
|
|
|
|
namespace {

// Row type of the image dmask intrinsic lookup table pulled in below; the only
// field is the intrinsic ID.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// Request the table implementation from the included .inc file.
// NOTE(review): presumably this is what provides getAMDGPUImageDMaskIntrinsic()
// used later in this file -- confirm against InstCombineTables.inc.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace
|
|
|
|
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
|
|
//
|
|
// A single NaN input is folded to minnum, so we rely on that folding for
|
|
// handling NaNs.
|
|
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
|
|
const APFloat &Src2) {
|
|
APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
|
|
|
|
APFloat::cmpResult Cmp0 = Max3.compare(Src0);
|
|
assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
|
|
if (Cmp0 == APFloat::cmpEqual)
|
|
return maxnum(Src1, Src2);
|
|
|
|
APFloat::cmpResult Cmp1 = Max3.compare(Src1);
|
|
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
|
|
if (Cmp1 == APFloat::cmpEqual)
|
|
return maxnum(Src0, Src2);
|
|
|
|
return maxnum(Src0, Src1);
|
|
}
|
|
|
|
// Check if a value can be converted to a 16-bit value without losing
|
|
// precision.
|
|
static bool canSafelyConvertTo16Bit(Value &V) {
|
|
Type *VTy = V.getType();
|
|
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
|
|
// The value is already 16-bit, so we don't want to convert to 16-bit again!
|
|
return false;
|
|
}
|
|
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
|
|
// We need to check that if we cast the index down to a half, we do not lose
|
|
// precision.
|
|
APFloat FloatValue(ConstFloat->getValueAPF());
|
|
bool LosesInfo = true;
|
|
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
|
|
return !LosesInfo;
|
|
}
|
|
Value *CastSrc;
|
|
if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
|
|
match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
|
|
match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
|
|
Type *CastSrcTy = CastSrc->getType();
|
|
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Convert a value to 16-bit.
|
|
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
|
|
Type *VTy = V.getType();
|
|
if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
|
|
return cast<Instruction>(&V)->getOperand(0);
|
|
if (VTy->isIntegerTy())
|
|
return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
|
|
if (VTy->isFloatingPointTy())
|
|
return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
|
|
|
|
llvm_unreachable("Should never be called!");
|
|
}
|
|
|
|
/// Try to rewrite an image dimension intrinsic so that its gradient and/or
/// coordinate operands use 16-bit types.
///
/// \param ST            Subtarget, queried for hasA16() / hasG16().
/// \param ImageDimIntr  Operand layout of the intrinsic (GradientStart,
///                      CoordStart, VAddrEnd, GradientTyArg, CoordTyArg).
/// \param II            The intrinsic call to simplify.
/// \param IC            InstCombiner used to build and install the new call.
/// \returns None if nothing was changed, otherwise the result of replacing
///          \p II with the narrowed call.
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      // A non-convertible gradient, or an intrinsic without any gradients,
      // leaves nothing to narrow.
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  // Without A16 the coordinates must stay 32-bit; only G16 remains possible.
  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true;

  // Converting only the derivatives needs G16 and at least one gradient
  // operand. Without this check an intrinsic that has no gradients would, on
  // a G16-only subtarget, get its gradient type argument overwritten and be
  // replaced by a new call although no operand was actually converted.
  if (OnlyDerivatives &&
      (!ST->hasG16() ||
       ImageDimIntr->GradientStart == ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  // Patch the overloaded type arguments to the narrowed type.
  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  // Narrow the gradients, and the coordinates too unless only the derivatives
  // are being converted.
  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}
|
|
|
|
/// AMDGPU-specific InstCombine folds for target intrinsics.
///
/// Dispatches on the intrinsic ID of \p II. Each case either returns the
/// result of a rewrite performed through \p IC (or \p II itself when it was
/// modified in place), or falls out of the switch, in which case None is
/// returned to signal that nothing was changed.
Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  const Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Operand = II.getArgOperand(0);

    // rcp(undef) -> qnan
    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Operand)) {
      Type *ResTy = II.getType();
      auto *QNaN =
          ConstantFP::get(ResTy, APFloat::getQNaN(ResTy->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    const auto *CArg = dyn_cast<ConstantFP>(Operand);
    if (!CArg)
      break;

    // Fold to the constant 1 / C.
    //
    // This is more precise than the instruction may give.
    //
    // TODO: The instruction always flushes denormal results (except for f16),
    // should this also?
    const APFloat &Divisor = CArg->getValueAPF();
    APFloat Reciprocal(Divisor.getSemantics(), 1);
    Reciprocal.divide(Divisor, APFloat::rmNearestTiesToEven);
    return IC.replaceInstUsesWith(II,
                                  ConstantFP::get(II.getContext(), Reciprocal));
  }
  case Intrinsic::amdgcn_rsq: {
    // rsq(undef) -> qnan
    // TODO: Move to ConstantFolding/InstSimplify?
    if (!isa<UndefValue>(II.getArgOperand(0)))
      break;

    Type *ResTy = II.getType();
    auto *QNaN =
        ConstantFP::get(ResTy, APFloat::getQNaN(ResTy->getFltSemantics()));
    return IC.replaceInstUsesWith(II, QNaN);
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Operand = II.getArgOperand(0);

    if (const auto *CArg = dyn_cast<ConstantFP>(Operand)) {
      int Exponent;
      APFloat Mantissa =
          frexp(CArg->getValueAPF(), Exponent, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant)
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Mantissa));

      // Match instruction special case behavior: NaN and infinity inputs
      // produce an exponent of 0.
      if (Exponent == APFloat::IEK_NaN || Exponent == APFloat::IEK_Inf)
        Exponent = 0;

      return IC.replaceInstUsesWith(II,
                                    ConstantInt::get(II.getType(), Exponent));
    }

    if (isa<UndefValue>(Operand))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    break;
  }
  case Intrinsic::amdgcn_class: {
    // Bits of the class test mask.
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      // Without a constant mask only undef operands can be folded.
      if (isa<UndefValue>(Src0))
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

      if (isa<UndefValue>(Src1))
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      break;
    }

    const uint32_t Mask = CMask->getZExtValue();
    const uint32_t UsedBits = Mask & FullMask;

    // If all tests are made, it doesn't matter what the value is.
    if (UsedBits == FullMask)
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));

    // If no test is made, the result is always false.
    if (UsedBits == 0)
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *IsNaN = IC.Builder.CreateFCmpUNO(Src0, Src0);
      IsNaN->takeName(&II);
      return IC.replaceInstUsesWith(II, IsNaN);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *IsZero =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
      IsZero->takeName(&II);
      return IC.replaceInstUsesWith(II, IsZero);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if ((Mask & (S_NAN | Q_NAN)) != 0 &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0))
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

      // Clamp mask to used bits
      if (UsedBits != Mask) {
        CallInst *Clamped = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), UsedBits)});

        Clamped->takeName(&II);
        return IC.replaceInstUsesWith(II, Clamped);
      }

      break;
    }

    // Both operands are constant: work out which single class the value
    // belongs to and test that bit against the mask.
    const APFloat &Val = CVal->getValueAPF();
    const bool IsNeg = Val.isNegative();

    uint32_t ClassBit = 0;
    if (Val.isNaN())
      ClassBit = Val.isSignaling() ? S_NAN : Q_NAN;
    else if (Val.isInfinity())
      ClassBit = IsNeg ? N_INFINITY : P_INFINITY;
    else if (Val.isNormal())
      ClassBit = IsNeg ? N_NORMAL : P_NORMAL;
    else if (Val.isDenormal())
      ClassBit = IsNeg ? N_SUBNORMAL : P_SUBNORMAL;
    else if (Val.isZero())
      ClassBit = IsNeg ? N_ZERO : P_ZERO;

    const bool Result = (Mask & ClassBit) != 0;
    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    const auto *C0 = dyn_cast<ConstantFP>(Src0);
    const auto *C1 = dyn_cast<ConstantFP>(Src1);
    if (C0 && C1) {
      // Convert both constants to the result element type, rounding toward
      // zero, and pack them into a constant vector.
      const fltSemantics &HalfSem =
          II.getType()->getScalarType()->getFltSemantics();
      bool LosesInfo;
      APFloat Lo = C0->getValueAPF();
      APFloat Hi = C1->getValueAPF();
      Lo.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
      Hi.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

      Constant *Folded =
          ConstantVector::get({ConstantFP::get(II.getContext(), Lo),
                               ConstantFP::get(II.getContext(), Hi)});
      return IC.replaceInstUsesWith(II, Folded);
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // Only the all-undef case is folded.
    const bool BothUndef = isa<UndefValue>(II.getArgOperand(0)) &&
                           isa<UndefValue>(II.getArgOperand(1));
    if (BothUndef)
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    Type *Ty = II.getType();
    const unsigned IntSize = Ty->getIntegerBitWidth();
    const unsigned LowBitsMask = IntSize - 1;

    unsigned Width;
    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      // A field that is zero bits wide after masking extracts nothing.
      if ((Width & LowBitsMask) == 0)
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize)
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & LowBitsMask));
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize)
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & LowBitsMask));
    }

    const bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    Value *RightShift;
    if (Offset + Width < IntSize) {
      // Shift the field up to the top bit, then back down to bit 0.
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                          : IC.Builder.CreateLShr(Shl, IntSize - Width);
    } else {
      // The field reaches the top bit, so one right shift is enough.
      RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                          : IC.Builder.CreateLShr(Src, Offset);
    }

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    const unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    // Replace every source whose enable bit(s) are clear with undef. The
    // compressed form has two sources covering two enable bits each; the
    // plain form has four sources with one bit each.
    const bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      const unsigned SrcEnable = IsCompr ? (0x3 << (2 * I)) : (1 << I);
      if ((EnBits & SrcEnable) != 0)
        continue;

      Value *Src = II.getArgOperand(I + 2);
      if (isa<UndefValue>(Src))
        continue;

      IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
      Changed = true;
    }

    if (Changed)
      return &II;

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *MinMax = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0))
      MinMax = IC.Builder.CreateMinNum(Src1, Src2);
    else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1))
      MinMax = IC.Builder.CreateMinNum(Src0, Src2);
    else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2))
      MinMax = IC.Builder.CreateMaxNum(Src0, Src1);

    if (MinMax) {
      MinMax->copyFastMathFlags(&II);
      MinMax->takeName(&II);
      return IC.replaceInstUsesWith(II, MinMax);
    }

    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    bool Swapped = false;
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swapped = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swapped = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swapped = true;
    }

    if (Swapped) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    // All three operands constant: fold to the median.
    const auto *C0 = dyn_cast<ConstantFP>(Src0);
    const auto *C1 = dyn_cast<ConstantFP>(Src1);
    const auto *C2 = dyn_cast<ConstantFP>(Src2);
    if (C0 && C1 && C2) {
      APFloat Median = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                   C2->getValueAPF());
      return IC.replaceInstUsesWith(
          II, ConstantFP::get(IC.Builder.getContext(), Median));
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments: the predicate must lie in the range
    // that belongs to this kind of compare.
    int64_t CCVal = CC->getZExtValue();
    const bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    const int64_t FirstPred = IsInteger ? CmpInst::FIRST_ICMP_PREDICATE
                                        : CmpInst::FIRST_FCMP_PREDICATE;
    const int64_t LastPred = IsInteger ? CmpInst::LAST_ICMP_PREDICATE
                                       : CmpInst::LAST_FCMP_PREDICATE;
    if (CCVal < FirstPred || CCVal > LastPred)
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        // A compare that is always false folds to zero.
        if (CCmp->isNullValue())
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *ReadReg = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *ExecRead = IC.Builder.CreateCall(ReadReg, Args);
        ExecRead->addAttribute(AttributeList::FunctionIndex,
                               Attribute::Convergent);
        ExecRead->takeName(&II);
        return IC.replaceInstUsesWith(II, ExecRead);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    // The remaining folds only apply to eq/ne compares.
    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      const Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                       ? Intrinsic::amdgcn_fcmp
                                       : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        const unsigned Width = CmpType->getBitWidth();

        // Don't do anything for i1 comparisons, and anything wider than 64
        // bits can't be handled.
        if (Width == 1 || Width > 64)
          break;

        // Promote to next legal integer type.
        const unsigned NewWidth = Width <= 16 ? 16 : (Width <= 32 ? 32 : 64);
        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          // Extend with the signedness the predicate expects.
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *Folded = IC.Builder.CreateCall(NewF, Args);
      Folded->takeName(&II);
      return IC.replaceInstUsesWith(II, Folded);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!Src)
      break;

    // amdgcn.ballot(i1 0) is zero.
    if (Src->isZero())
      return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));

    if (Src->isOne()) {
      // amdgcn.ballot(i1 1) is exec. A 32-bit result reads exec_lo, a 64-bit
      // result reads exec; other result types are left alone.
      const char *RegName = "exec";
      if (II.getType()->isIntegerTy(32))
        RegName = "exec_lo";
      else if (!II.getType()->isIntegerTy(64))
        break;

      Function *ReadReg = Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::read_register, II.getType());
      Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
      MDNode *MD = MDNode::get(II.getContext(), MDArgs);
      Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
      CallInst *ExecRead = IC.Builder.CreateCall(ReadReg, Args);
      ExecRead->addAttribute(AttributeList::FunctionIndex,
                             Attribute::Convergent);
      ExecRead->takeName(&II);
      return IC.replaceInstUsesWith(II, ExecRead);
    }

    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    Value *Arg = II.getArgOperand(0);
    if (isa<Constant>(Arg))
      return IC.replaceInstUsesWith(II, Arg);

    break;
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    auto *RowMask = cast<ConstantInt>(II.getArgOperand(3));
    auto *BankMask = cast<ConstantInt>(II.getArgOperand(4));

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    const bool CanDropOld = !BoundCtrl->isZeroValue() &&
                            RowMask->getZExtValue() == 0xF &&
                            BankMask->getZExtValue() == 0xF;
    if (!CanDropOld || isa<UndefValue>(Old))
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (FetchInvalid->getZExtValue() || BoundCtrl->getZExtValue())
      return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));

    break;
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    Value *Src = II.getArgOperand(0);

    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(Src))
      return IC.replaceInstUsesWith(II, C);

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
      return IC.replaceInstUsesWith(II, Src);

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>()))
        return IC.replaceInstUsesWith(II, Src);
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1)))))
        return IC.replaceInstUsesWith(II, Src);
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity()))
      return IC.replaceInstUsesWith(II, Op0);

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt()))
      return IC.replaceInstUsesWith(II, Op0);

    break;
  }
  default: {
    // Anything else is only interesting if it is an image dimension intrinsic.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}
|
|
|
|
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
|
|
///
|
|
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
|
|
/// struct returns.
|
|
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
|
|
IntrinsicInst &II,
|
|
APInt DemandedElts,
|
|
int DMaskIdx = -1) {
|
|
|
|
auto *IIVTy = cast<FixedVectorType>(II.getType());
|
|
unsigned VWidth = IIVTy->getNumElements();
|
|
if (VWidth == 1)
|
|
return nullptr;
|
|
|
|
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
|
|
IC.Builder.SetInsertPoint(&II);
|
|
|
|
// Assume the arguments are unchanged and later override them, if needed.
|
|
SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());
|
|
|
|
if (DMaskIdx < 0) {
|
|
// Buffer case.
|
|
|
|
const unsigned ActiveBits = DemandedElts.getActiveBits();
|
|
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
|
|
|
|
// Start assuming the prefix of elements is demanded, but possibly clear
|
|
// some other bits if there are trailing zeros (unused components at front)
|
|
// and update offset.
|
|
DemandedElts = (1 << ActiveBits) - 1;
|
|
|
|
if (UnusedComponentsAtFront > 0) {
|
|
static const unsigned InvalidOffsetIdx = 0xf;
|
|
|
|
unsigned OffsetIdx;
|
|
switch (II.getIntrinsicID()) {
|
|
case Intrinsic::amdgcn_raw_buffer_load:
|
|
OffsetIdx = 1;
|
|
break;
|
|
case Intrinsic::amdgcn_s_buffer_load:
|
|
// If resulting type is vec3, there is no point in trimming the
|
|
// load with updated offset, as the vec3 would most likely be widened to
|
|
// vec4 anyway during lowering.
|
|
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
|
|
OffsetIdx = InvalidOffsetIdx;
|
|
else
|
|
OffsetIdx = 1;
|
|
break;
|
|
case Intrinsic::amdgcn_struct_buffer_load:
|
|
OffsetIdx = 2;
|
|
break;
|
|
default:
|
|
// TODO: handle tbuffer* intrinsics.
|
|
OffsetIdx = InvalidOffsetIdx;
|
|
break;
|
|
}
|
|
|
|
if (OffsetIdx != InvalidOffsetIdx) {
|
|
// Clear demanded bits and update the offset.
|
|
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
|
|
auto *Offset = II.getArgOperand(OffsetIdx);
|
|
unsigned SingleComponentSizeInBits =
|
|
IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
|
|
unsigned OffsetAdd =
|
|
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
|
|
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
|
|
Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
|
|
}
|
|
}
|
|
} else {
|
|
// Image case.
|
|
|
|
ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
|
|
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
|
|
|
|
// Mask off values that are undefined because the dmask doesn't cover them
|
|
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
|
|
|
|
unsigned NewDMaskVal = 0;
|
|
unsigned OrigLoadIdx = 0;
|
|
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
|
|
const unsigned Bit = 1 << SrcIdx;
|
|
if (!!(DMaskVal & Bit)) {
|
|
if (!!DemandedElts[OrigLoadIdx])
|
|
NewDMaskVal |= Bit;
|
|
OrigLoadIdx++;
|
|
}
|
|
}
|
|
|
|
if (DMaskVal != NewDMaskVal)
|
|
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
|
|
}
|
|
|
|
unsigned NewNumElts = DemandedElts.countPopulation();
|
|
if (!NewNumElts)
|
|
return UndefValue::get(II.getType());
|
|
|
|
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
|
|
if (DMaskIdx >= 0)
|
|
II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
|
|
return nullptr;
|
|
}
|
|
|
|
// Validate function argument and return types, extracting overloaded types
|
|
// along the way.
|
|
SmallVector<Type *, 6> OverloadTys;
|
|
if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
|
|
return nullptr;
|
|
|
|
Module *M = II.getParent()->getParent()->getParent();
|
|
Type *EltTy = IIVTy->getElementType();
|
|
Type *NewTy =
|
|
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
|
|
|
|
OverloadTys[0] = NewTy;
|
|
Function *NewIntrin =
|
|
Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
|
|
|
|
CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
|
|
NewCall->takeName(&II);
|
|
NewCall->copyMetadata(II);
|
|
|
|
if (NewNumElts == 1) {
|
|
return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
|
|
NewCall,
|
|
DemandedElts.countTrailingZeros());
|
|
}
|
|
|
|
SmallVector<int, 8> EltMask;
|
|
unsigned NewLoadIdx = 0;
|
|
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
|
|
if (!!DemandedElts[OrigLoadIdx])
|
|
EltMask.push_back(NewLoadIdx++);
|
|
else
|
|
EltMask.push_back(NewNumElts);
|
|
}
|
|
|
|
Value *Shuffle =
|
|
IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
|
|
|
|
return Shuffle;
|
|
}
|
|
|
|
/// Demanded-vector-elements hook for AMDGPU intrinsics.
///
/// Buffer load intrinsics and image intrinsics that have a dmask table entry
/// are forwarded to simplifyAMDGCNMemoryIntrinsicDemanded (images with dmask
/// operand index 0). For every other intrinsic None is returned. The undef
/// element outputs and \p SimplifyAndSetOp are not used here.
Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  const Intrinsic::ID IID = II.getIntrinsicID();

  switch (IID) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default:
    break;
  }

  // Not a buffer load: handle it only if it is a known dmask image intrinsic.
  if (getAMDGPUImageDMaskIntrinsic(IID))
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);

  return None;
}
|