InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyDemanded

Summary:
Use the expanded features of the TableGen generic tables to avoid manually adding the combinatorially exploded set of intrinsics. The getAMDGPUImageDimIntrinsic lookup function is early-out, i.e. non-AMDGPU intrinsics will never look at the underlying table.

Use a generic approach for getting the new intrinsic overload to keep the code simple, and make the image dmask handling more generic:
- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix

There is some overlap between this code and an optimization that happens in the backend during code generation. They currently complement each other:
- only the codegen optimization can generate vec3 loads
- only the InstCombine optimization can handle D16

The InstCombine optimization also likely covers more cases since the codegen optimization is fairly ad-hoc. Ideally, we'll remove the optimization in codegen once the infrastructure for vec3 is in place (which will probably take a long time).

Modify the test cases to use dimension-aware intrinsics. This makes it easier to see that the test coverage for the new intrinsics is equivalent, and the old style intrinsics will be removed in a follow-up commit anyway.

Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad
Reviewers: arsenm, rampitec, majnemer
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D48165
llvm-svn: 335230
2018-06-21 13:37:31 +00:00 · 2018-06-21 13:37:31 +00:00 · b29ee70122
parent 1045928aab
commit b29ee70122
6 changed files with 612 additions and 613 deletions
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@ -803,9 +803,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);
  let TargetPrefix = "amdgcn";
 }
 // Marker class for intrinsics with a DMask that determines the returned
 // channels.
 class AMDGPUImageDMaskIntrinsic;
 defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
  //////////////////////////////////////////////////////////////////////////
@ -839,10 +845,14 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
    }
  }
-  defm int_amdgcn_image_load : AMDGPUImageDimIntrinsicsAll<
+  defm int_amdgcn_image_load
-              "LOAD", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand]>;
+    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
-  defm int_amdgcn_image_load_mip : AMDGPUImageDimIntrinsicsNoMsaa<
+                                  [SDNPMemOperand]>,
-              "LOAD_MIP", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand], 1>;
+      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
                                     [IntrReadMem], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
@ -866,18 +876,22 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
  }
  foreach sample = AMDGPUSampleVariants in {
-    defm int_amdgcn_image_sample # sample.LowerCaseMod :
+    defm int_amdgcn_image_sample # sample.LowerCaseMod
-        AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>;
+      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }
-  defm int_amdgcn_image_getlod : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>;
+  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;
  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
-      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>;
+      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }
  //////////////////////////////////////////////////////////////////////////
--- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt
@ -1,3 +1,7 @@
 # Generate InstCombineTables.inc from InstCombineTables.td using TableGen's
 # searchable-tables backend. The .inc is included by
 # InstCombineSimplifyDemanded.cpp to look up AMDGPU image dmask intrinsics.
 set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
 tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
 # NOTE(review): presumably publishes the generation step as the
 # InstCombineTableGen target so dependants can order against it - confirm
 # against the LLVM CMake helpers.
 add_public_tablegen_target(InstCombineTableGen)
 add_llvm_library(LLVMInstCombine
  InstructionCombining.cpp
  InstCombineAddSub.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@ -706,6 +706,10 @@ private:
  /// demanded bits.
  bool SimplifyDemandedInstructionBits(Instruction &Inst);
  /// Narrow the vector result of the amdgcn buffer/image intrinsic call \p II
  /// to the elements set in \p DemandedElts.
  ///
  /// \p DmaskIdx is the argument index of the intrinsic's dmask operand, or a
  /// negative value (the default) when the intrinsic has no dmask. The
  /// out-of-line definition spells this parameter DMaskIdx.
  ///
  /// Returns the replacement value, or nullptr when no replacement value is
  /// produced.
  Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                               APInt DemandedElts,
                                               int DmaskIdx = -1);
  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                    APInt &UndefElts, unsigned Depth = 0);
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
 #define DEBUG_TYPE "instcombine"
 // File-local glue for the TableGen-generated AMDGPUImageDMaskIntrinsicTable
 // (declared in InstCombineTables.td).
 namespace {
 // Row type of the generated table. Intr is the table's only field and its
 // primary key.
 // NOTE(review): presumably holds an Intrinsic::ID value - the lookup in this
 // file is keyed on II->getIntrinsicID(); confirm against the generated .inc.
 struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
 };
 // Select the implementation section of the generated include. The table's
 // PrimaryKeyName is getAMDGPUImageDMaskIntrinsic, which is the lookup used by
 // SimplifyDemandedVectorElts in this file.
 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
 #include "InstCombineTables.inc"
 } // end anonymous namespace
 /// Check to see if the specified operand of the specified instruction is a
 /// constant integer. If so, check to see if there are any bits set in the
 /// constant that are not demanded. If so, shrink the constant and return true.
@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
  return nullptr;
 }
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
 /// Shrinks the vector returned by the intrinsic call \p II so that only the
 /// elements in \p DemandedElts are loaded, and (for intrinsics that have one)
 /// clears the dmask bits whose channels are not demanded.
 ///
 /// \param II the intrinsic call whose result is being narrowed.
 /// \param DemandedElts bit i is set when element i of the result is used.
 /// \param DMaskIdx argument index of the dmask operand of \p II, or a negative
 ///        value when the intrinsic has no dmask (callers pass -1 for buffer
 ///        loads).
 /// \returns nullptr when no replacement value is created (the dmask operand of
 ///          \p II may nevertheless have been rewritten in place); an undef of
 ///          the original type when no element is demanded; otherwise an
 ///          insertelement/shufflevector that expands a newly created narrower
 ///          call back to the original vector type.
 Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                            APInt DemandedElts,
                                                            int DMaskIdx) {
  // NOTE(review): assumes II returns a vector type - confirm at call sites.
  unsigned VWidth = II->getType()->getVectorNumElements();
  // A single-element result cannot be narrowed any further.
  if (VWidth == 1)
    return nullptr;
  // Set only when the dmask has to change.
  ConstantInt *NewDMask = nullptr;
  if (DMaskIdx < 0) {
    // Pretend that a prefix of elements is demanded to simplify the code
    // below.
    // I.e. widen the demanded set to all elements up to and including the
    // highest demanded one.
    DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
  } else {
    ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
    if (!DMask)
      return nullptr; // non-constant dmask is not supported by codegen
    // Only the low four bits of the dmask are considered.
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
    // Walk the four dmask bits. The n-th set bit corresponds to result element
    // n (OrigLoadIdx); keep the bit only if that element is demanded.
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!(DemandedElts & (1 << OrigLoadIdx)))
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }
    // Materialize a new constant only when the mask actually changed.
    if (DMaskVal != NewDMaskVal)
      NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }
  // TODO: Handle 3 vectors when supported in code gen.
  // The new element count is the demanded count rounded up to a power of two.
  unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
  // Nothing is demanded: the whole result can be undef.
  if (!NewNumElts)
    return UndefValue::get(II->getType());
  // The result would not get narrower and the demanded elements are already a
  // low-element prefix, so no new call is needed; at most the dmask is updated
  // in place on the existing call.
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (NewDMask)
      II->setArgOperand(DMaskIdx, NewDMask);
    return nullptr;
  }
  // Determine the overload types of the original intrinsic.
  // The return type and then each parameter type are matched against the
  // intrinsic's type descriptor table, collecting overloaded types in order.
  auto IID = II->getIntrinsicID();
  SmallVector<Intrinsic::IITDescriptor, 16> Table;
  getIntrinsicInfoTableEntries(IID, Table);
  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
  FunctionType *FTy = II->getCalledFunction()->getFunctionType();
  SmallVector<Type *, 6> OverloadTys;
  Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
    Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
  // Get the new return type overload of the intrinsic.
  // A single remaining element becomes a scalar rather than a 1-vector.
  Module *M = II->getParent()->getParent()->getParent();
  Type *EltTy = II->getType()->getVectorElementType();
  Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
  // Slot 0 is overwritten with the narrowed return type; the remaining
  // overload types are reused unchanged.
  OverloadTys[0] = NewTy;
  Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
  // Reuse the original arguments, substituting the reduced dmask if any.
  SmallVector<Value *, 16> Args;
  for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
    Args.push_back(II->getArgOperand(I));
  if (NewDMask)
    Args[DMaskIdx] = NewDMask;
  // Emit the replacement call immediately before the original one.
  // NOTE(review): Guard presumably restores the builder's previous insertion
  // point when this function returns - confirm against IRBuilder docs.
  IRBuilderBase::InsertPointGuard Guard(Builder);
  Builder.SetInsertPoint(II);
  CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(II);
  NewCall->copyMetadata(*II);
  // Scalar result: place it at the position of the one demanded element in an
  // otherwise undef vector of the original type.
  if (NewNumElts == 1) {
    return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
                                       DemandedElts.countTrailingZeros());
  }
  // Build a shuffle mask with one entry per original element. Demanded
  // elements take consecutive lanes of the new call; all others use index
  // NewNumElts, i.e. the first lane of the undef second shuffle operand.
  SmallVector<uint32_t, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!(DemandedElts & (1 << OrigLoadIdx)))
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }
  Value *Shuffle =
      Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
  return Shuffle;
 }
 /// The specified value produces a vector with any number of elements.
 /// DemandedElts contains the set of elements that are actually used by the
 /// caller. This method analyzes which elements of the operand are undef and
@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
    if (!II) break;
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_xop_vfrcz_ss:
    case Intrinsic::x86_xop_vfrcz_sd:
      // The instructions for these intrinsics are speced to zero upper bits not
@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
    case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
    case Intrinsic::amdgcn_image_getlod: {
      if (VWidth == 1 || !DemandedElts.isMask())
        return nullptr;
      // TODO: Handle 3 vectors when supported in code gen.
      unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
      if (NewNumElts == VWidth)
        return nullptr;
      Module *M = II->getParent()->getParent()->getParent();
      Type *EltTy = V->getType()->getVectorElementType();
      Type *NewTy = (NewNumElts == 1) ? EltTy :
        VectorType::get(EltTy, NewNumElts);
      auto IID = II->getIntrinsicID();
      bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
                      IID == Intrinsic::amdgcn_buffer_load_format;
      return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts,
                                                   IsBuffer ? -1 : 3);
    }
    default: {
      if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
        return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
      Function *NewIntrin = IsBuffer ?
        Intrinsic::getDeclaration(M, IID, NewTy) :
        // Samplers have 3 mangled types.
        Intrinsic::getDeclaration(M, IID,
                                  { NewTy, II->getArgOperand(0)->getType(),
                                      II->getArgOperand(1)->getType()});
      SmallVector<Value *, 5> Args;
      for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
        Args.push_back(II->getArgOperand(I));
      IRBuilderBase::InsertPointGuard Guard(Builder);
      Builder.SetInsertPoint(II);
      CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
      NewCall->takeName(II);
      NewCall->copyMetadata(*II);
      if (!IsBuffer) {
        ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
        if (DMask) {
          unsigned DMaskVal = DMask->getZExtValue() & 0xf;
          unsigned PopCnt = 0;
          unsigned NewDMask = 0;
          for (unsigned I = 0; I < 4; ++I) {
            const unsigned Bit = 1 << I;
            if (!!(DMaskVal & Bit)) {
              if (++PopCnt > NewNumElts)
      break;
              NewDMask |= Bit;
            }
          }
          NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
        }
      }
      if (NewNumElts == 1) {
        return Builder.CreateInsertElement(UndefValue::get(V->getType()),
                                           NewCall, static_cast<uint64_t>(0));
      }
      SmallVector<uint32_t, 8> EltMask;
      for (unsigned I = 0; I < VWidth; ++I)
        EltMask.push_back(I);
      Value *Shuffle = Builder.CreateShuffleVector(
        NewCall, UndefValue::get(NewTy), EltMask);
      MadeChange = true;
      return Shuffle;
    }
    }
    break;
--- a/llvm/lib/Transforms/InstCombine/InstCombineTables.td
+++ b/llvm/lib/Transforms/InstCombine/InstCombineTables.td
@ -0,0 +1,11 @@
 include "llvm/TableGen/SearchableTable.td"
 include "llvm/IR/Intrinsics.td"
 // Searchable table of every intrinsic record that derives from the
 // AMDGPUImageDMaskIntrinsic marker class (declared in IntrinsicsAMDGPU.td),
 // i.e. image intrinsics whose dmask determines the returned channels.
 def AMDGPUImageDMaskIntrinsicTable : GenericTable {
  let FilterClass = "AMDGPUImageDMaskIntrinsic";
  // The intrinsic itself is the only column and also the lookup key.
  let Fields = ["Intr"];
  let PrimaryKey = ["Intr"];
  // Name of the generated lookup function used by InstCombine.
  let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
  // Early-out lookup: per the commit description, intrinsics outside the
  // table's key range never touch the underlying table.
  let PrimaryKeyEarlyOut = 1;
 }
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll