InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyDemanded
Summary:
Use the expanded features of the TableGen generic tables to avoid manually
adding the combinatorially exploded set of intrinsics. The
getAMDGPUImageDimIntrinsic lookup function is early-out, i.e. non-AMDGPU
intrinsics will never look at the underlying table.

Use a generic approach for getting the new intrinsic overload to keep the
code simple, and make the image dmask handling more generic:
- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix

There is some overlap between this code and an optimization that happens in
the backend during code generation. They currently complement each other:
- only the codegen optimization can generate vec3 loads
- only the InstCombine optimization can handle D16

The InstCombine optimization also likely covers more cases since the codegen
optimization is fairly ad-hoc. Ideally, we'll remove the optimization in
codegen once the infrastructure for vec3 is in place (which will probably
take a long time).

Modify the test cases to use dimension-aware intrinsics. This makes it
easier to see that the test coverage for the new intrinsics is equivalent,
and the old style intrinsics will be removed in a follow-up commit anyway.

Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad

Reviewers: arsenm, rampitec, majnemer

Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D48165

llvm-svn: 335230
parent 1045928aab
commit b29ee70122
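
Note: the "not a prefix" bullet in the summary is the interesting case. The
dmask enables up to four channels, and element i of the returned vector
corresponds to the i-th set bit of the dmask, so dropping interior elements
must clear interior dmask bits. Below is a minimal standalone C++ sketch of
the recomputation this patch adds; the helper name narrowDMask is
illustrative, not part of the patch.

  // Sketch (assumption, mirrors the loop added in this patch): recompute the
  // dmask so that only bits whose vector elements are demanded survive.
  unsigned narrowDMask(unsigned DMaskVal, unsigned DemandedElts) {
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1u << SrcIdx;
      if (DMaskVal & Bit) {
        // Element OrigLoadIdx of the load corresponds to this dmask bit.
        if (DemandedElts & (1u << OrigLoadIdx))
          NewDMaskVal |= Bit;
        ++OrigLoadIdx;
      }
    }
    return NewDMaskVal;
  }

  // Example: narrowDMask(0xf, 0b0101) == 0b0101. The demanded elements
  // {0, 2} are not a prefix, yet only the x and z channels stay enabled.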
@@ -803,9 +803,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
       !if(P_.IsAtomic, 0, 1)), 1> {
   AMDGPUDimProfile P = P_;
 
+  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);
+
   let TargetPrefix = "amdgcn";
 }
 
+// Marker class for intrinsics with a DMask that determines the returned
+// channels.
+class AMDGPUImageDMaskIntrinsic;
+
 defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
 
   //////////////////////////////////////////////////////////////////////////
@@ -839,10 +845,14 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
     }
   }
 
-  defm int_amdgcn_image_load : AMDGPUImageDimIntrinsicsAll<
-    "LOAD", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand]>;
-  defm int_amdgcn_image_load_mip : AMDGPUImageDimIntrinsicsNoMsaa<
-    "LOAD_MIP", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand], 1>;
+  defm int_amdgcn_image_load
+    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
+                                  [SDNPMemOperand]>,
+      AMDGPUImageDMaskIntrinsic;
+  defm int_amdgcn_image_load_mip
+    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
+                                     [IntrReadMem], [SDNPMemOperand], 1>,
+      AMDGPUImageDMaskIntrinsic;
 
   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
     "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
@@ -866,18 +876,22 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
   }
 
   foreach sample = AMDGPUSampleVariants in {
-    defm int_amdgcn_image_sample # sample.LowerCaseMod :
-      AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>;
+    defm int_amdgcn_image_sample # sample.LowerCaseMod
+      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
+        AMDGPUImageDMaskIntrinsic;
   }
 
-  defm int_amdgcn_image_getlod : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>;
+  defm int_amdgcn_image_getlod
+    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
+      AMDGPUImageDMaskIntrinsic;
 
   //////////////////////////////////////////////////////////////////////////
   // getresinfo intrinsics
   //////////////////////////////////////////////////////////////////////////
   foreach dim = AMDGPUDims.All in {
     def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
-      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>;
+      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
+        AMDGPUImageDMaskIntrinsic;
   }
 
   //////////////////////////////////////////////////////////////////////////
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
+tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
+add_public_tablegen_target(InstCombineTableGen)
+
 add_llvm_library(LLVMInstCombine
   InstructionCombining.cpp
   InstCombineAddSub.cpp
@@ -706,6 +706,10 @@ private:
   /// demanded bits.
   bool SimplifyDemandedInstructionBits(Instruction &Inst);
 
+  Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+                                               APInt DemandedElts,
+                                               int DmaskIdx = -1);
+
   Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                     APInt &UndefElts, unsigned Depth = 0);
 
@@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+  unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
 /// Check to see if the specified operand of the specified instruction is a
 /// constant integer. If so, check to see if there are any bits set in the
 /// constant that are not demanded. If so, shrink the constant and return true.
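
Note: InstCombineTables.inc is generated at build time and not part of this
diff. Given the GenericTable definition added at the end of this commit, the
-gen-searchable-tables backend should emit code roughly of the following
shape. This sketch is our assumption about the output (entry set abbreviated
to two rows), not the literal generated file.

  // Assumed shape of InstCombineTables.inc (abbreviated):
  const AMDGPUImageDMaskIntrinsic AMDGPUImageDMaskIntrinsicTable[] = {
      {Intrinsic::amdgcn_image_getlod_1d},
      {Intrinsic::amdgcn_image_getlod_2d},
      // ... one row per record derived from AMDGPUImageDMaskIntrinsic ...
  };

  const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr) {
    // PrimaryKeyEarlyOut = 1: reject keys outside the table's key range
    // before searching, so non-AMDGPU intrinsic IDs never touch the table.
    if (Intr < AMDGPUImageDMaskIntrinsicTable[0].Intr ||
        Intr > std::end(AMDGPUImageDMaskIntrinsicTable)[-1].Intr)
      return nullptr;
    auto I = std::lower_bound(std::begin(AMDGPUImageDMaskIntrinsicTable),
                              std::end(AMDGPUImageDMaskIntrinsicTable), Intr,
                              [](const AMDGPUImageDMaskIntrinsic &LHS,
                                 unsigned RHS) { return LHS.Intr < RHS; });
    if (I == std::end(AMDGPUImageDMaskIntrinsicTable) || I->Intr != Intr)
      return nullptr;
    return I;
  }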
@@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
   return nullptr;
 }
 
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+                                                           APInt DemandedElts,
+                                                           int DMaskIdx) {
+  unsigned VWidth = II->getType()->getVectorNumElements();
+  if (VWidth == 1)
+    return nullptr;
+
+  ConstantInt *NewDMask = nullptr;
+
+  if (DMaskIdx < 0) {
+    // Pretend that a prefix of elements is demanded to simplify the code
+    // below.
+    DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+  } else {
+    ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
+    if (!DMask)
+      return nullptr; // non-constant dmask is not supported by codegen
+
+    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+    // Mask off values that are undefined because the dmask doesn't cover them
+    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+    unsigned NewDMaskVal = 0;
+    unsigned OrigLoadIdx = 0;
+    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+      const unsigned Bit = 1 << SrcIdx;
+      if (!!(DMaskVal & Bit)) {
+        if (!!(DemandedElts & (1 << OrigLoadIdx)))
+          NewDMaskVal |= Bit;
+        OrigLoadIdx++;
+      }
+    }
+
+    if (DMaskVal != NewDMaskVal)
+      NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+  }
+
+  // TODO: Handle 3 vectors when supported in code gen.
+  unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+  if (!NewNumElts)
+    return UndefValue::get(II->getType());
+
+  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+    if (NewDMask)
+      II->setArgOperand(DMaskIdx, NewDMask);
+    return nullptr;
+  }
+
+  // Determine the overload types of the original intrinsic.
+  auto IID = II->getIntrinsicID();
+  SmallVector<Intrinsic::IITDescriptor, 16> Table;
+  getIntrinsicInfoTableEntries(IID, Table);
+  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+  FunctionType *FTy = II->getCalledFunction()->getFunctionType();
+  SmallVector<Type *, 6> OverloadTys;
+  Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+    Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+
+  // Get the new return type overload of the intrinsic.
+  Module *M = II->getParent()->getParent()->getParent();
+  Type *EltTy = II->getType()->getVectorElementType();
+  Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+
+  OverloadTys[0] = NewTy;
+  Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
+
+  SmallVector<Value *, 16> Args;
+  for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+    Args.push_back(II->getArgOperand(I));
+
+  if (NewDMask)
+    Args[DMaskIdx] = NewDMask;
+
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  Builder.SetInsertPoint(II);
+
+  CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+  NewCall->takeName(II);
+  NewCall->copyMetadata(*II);
+
+  if (NewNumElts == 1) {
+    return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
+                                       DemandedElts.countTrailingZeros());
+  }
+
+  SmallVector<uint32_t, 8> EltMask;
+  unsigned NewLoadIdx = 0;
+  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+    if (!!(DemandedElts & (1 << OrigLoadIdx)))
+      EltMask.push_back(NewLoadIdx++);
+    else
+      EltMask.push_back(NewNumElts);
+  }
+
+  Value *Shuffle =
+      Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
+
+  return Shuffle;
+}
+
 /// The specified value produces a vector with any number of elements.
 /// DemandedElts contains the set of elements that are actually used by the
 /// caller. This method analyzes which elements of the operand are undef and
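
Note: a hand-traced example of the narrowed call plus scatter shuffle at the
end of the new function: with VWidth = 4 and demanded elements {0, 2},
NewNumElts becomes PowerOf2Ceil(2) = 2, and the shuffle scatters the two
returned lanes back to positions 0 and 2. The standalone helper below merely
restates the EltMask loop; the function itself is hypothetical.

  #include <cstdint>
  #include <vector>

  // Restatement of the EltMask construction above (names mirror the patch).
  std::vector<uint32_t> buildEltMask(unsigned VWidth, unsigned NewNumElts,
                                     unsigned DemandedElts) {
    std::vector<uint32_t> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (DemandedElts & (1u << OrigLoadIdx))
        EltMask.push_back(NewLoadIdx++); // demanded: next narrowed lane
      else
        EltMask.push_back(NewNumElts);   // undemanded: index into undef operand
    }
    return EltMask;
  }

  // buildEltMask(4, 2, 0b0101) == {0, 2, 1, 2}: the <2 x float> result lands
  // in lanes 0 and 2, while indices >= NewNumElts select lanes of the undef
  // second shuffle operand, leaving the undemanded positions undef.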
@@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II) break;
     switch (II->getIntrinsicID()) {
-    default: break;
-
     case Intrinsic::x86_xop_vfrcz_ss:
     case Intrinsic::x86_xop_vfrcz_sd:
       // The instructions for these intrinsics are speced to zero upper bits not
@@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
 
     case Intrinsic::amdgcn_image_getlod: {
-      if (VWidth == 1 || !DemandedElts.isMask())
-        return nullptr;
-
-      // TODO: Handle 3 vectors when supported in code gen.
-      unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
-      if (NewNumElts == VWidth)
-        return nullptr;
-
-      Module *M = II->getParent()->getParent()->getParent();
-      Type *EltTy = V->getType()->getVectorElementType();
-
-      Type *NewTy = (NewNumElts == 1) ? EltTy :
-        VectorType::get(EltTy, NewNumElts);
-
       auto IID = II->getIntrinsicID();
-
       bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
                       IID == Intrinsic::amdgcn_buffer_load_format;
-
-      Function *NewIntrin = IsBuffer ?
-        Intrinsic::getDeclaration(M, IID, NewTy) :
-        // Samplers have 3 mangled types.
-        Intrinsic::getDeclaration(M, IID,
-                                  { NewTy, II->getArgOperand(0)->getType(),
-                                      II->getArgOperand(1)->getType()});
-
-      SmallVector<Value *, 5> Args;
-      for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
-        Args.push_back(II->getArgOperand(I));
-
-      IRBuilderBase::InsertPointGuard Guard(Builder);
-      Builder.SetInsertPoint(II);
-
-      CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
-      NewCall->takeName(II);
-      NewCall->copyMetadata(*II);
-
-      if (!IsBuffer) {
-        ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
-        if (DMask) {
-          unsigned DMaskVal = DMask->getZExtValue() & 0xf;
-
-          unsigned PopCnt = 0;
-          unsigned NewDMask = 0;
-          for (unsigned I = 0; I < 4; ++I) {
-            const unsigned Bit = 1 << I;
-            if (!!(DMaskVal & Bit)) {
-              if (++PopCnt > NewNumElts)
-                break;
-
-              NewDMask |= Bit;
-            }
-          }
-
-          NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
-        }
-      }
-
-      if (NewNumElts == 1) {
-        return Builder.CreateInsertElement(UndefValue::get(V->getType()),
-                                           NewCall, static_cast<uint64_t>(0));
-      }
-
-      SmallVector<uint32_t, 8> EltMask;
-      for (unsigned I = 0; I < VWidth; ++I)
-        EltMask.push_back(I);
-
-      Value *Shuffle = Builder.CreateShuffleVector(
-        NewCall, UndefValue::get(NewTy), EltMask);
-
-      MadeChange = true;
-      return Shuffle;
+      return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts,
+                                                   IsBuffer ? -1 : 3);
+    }
+    default: {
+      if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
+        return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
+
+      break;
     }
     }
     break;
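
Note: after this change there are three entry points into the helper,
distinguished only by where (or whether) the dmask operand appears.
Schematically (DMaskIdx values as in the patch, call sites abbreviated):

  // II and DemandedElts come from SimplifyDemandedVectorElts.
  simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, /*DMaskIdx=*/-1); // buffer loads: no dmask operand
  simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, /*DMaskIdx=*/3);  // old-style image/sample: operand 3
  simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, /*DMaskIdx=*/0);  // dim-aware intrinsics: operand 0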
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+  let FilterClass = "AMDGPUImageDMaskIntrinsic";
+  let Fields = ["Intr"];
+
+  let PrimaryKey = ["Intr"];
+  let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+  let PrimaryKeyEarlyOut = 1;
+}
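
Note (our gloss on the SearchableTable fields, not text from the patch):
FilterClass collects every TableGen record that derives from
AMDGPUImageDMaskIntrinsic, which is what the defm/def changes above opt
intrinsics into; Fields picks the Intr field that AMDGPUImageDimIntrinsic
binds to the intrinsic record itself; PrimaryKey and PrimaryKeyName produce
the sorted key and the getAMDGPUImageDMaskIntrinsic lookup used in
InstCombineSimplifyDemanded.cpp; and PrimaryKeyEarlyOut = 1 requests the
range check sketched earlier, which keeps the lookup trivially cheap for
non-AMDGPU intrinsic IDs.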
(One file diff suppressed because it is too large.)