[AMDGPUPromoteAlloca] Make compatible with opaque pointers
This mainly changes the handling of bitcasts to not check the types being cast from/to -- we should only care about the actual load/store types. The GEP handling is also changed to not care about types, and instead just make sure that the GEP produces an offset corresponding to a vector element. This was a bit of a struggle for me, because this code seems to be pretty sensitive to small changes. The end result seems to produce strictly better results for the existing test coverage though, because we can now deal with more situations involving bitcasts.

Differential Revision: https://reviews.llvm.org/D121371
parent c5f34d1692
commit 3ed643ea76
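The essence of the new GEP handling: an access is mapped to a vector element purely from its byte offset into the alloca, which must decompose into at most one variable part that strides by exactly one element, plus a constant part that is a multiple of the element size. The sketch below restates that check outside of LLVM, with invented names; the patch's actual GEPToVectorIndex works on APInt offsets gathered by GEP->collectOffset (see the first hunk below).

#include <cstdint>
#include <optional>

// Toy model of the offset-based check (invented names, not the LLVM API).
// An access at byte offset `ConstOffset + Var * VarScale` from the start of
// the alloca maps onto a vector element only if the variable part strides by
// exactly one element and the constant part lands on an element boundary.
struct VectorIndex {
  bool UsesVariable;      // true: the index is the variable itself
  uint64_t ConstantIndex; // valid only when UsesVariable is false
};

std::optional<VectorIndex> offsetToVectorIndex(uint64_t ConstOffset,
                                               bool HasVariable,
                                               uint64_t VarScale,
                                               uint64_t ElemSize) {
  if (HasVariable) {
    // Only the "one element per step, no extra constant" form is accepted,
    // so no additional index arithmetic would have to be emitted.
    if (ConstOffset != 0 || VarScale != ElemSize)
      return std::nullopt;
    return VectorIndex{true, 0};
  }
  // Pure constant offset: must be an exact multiple of the element size.
  if (ElemSize == 0 || ConstOffset % ElemSize != 0)
    return std::nullopt;
  return VectorIndex{false, ConstOffset / ElemSize};
}

For example, with 4-byte elements a constant byte offset of 8 maps to element 2 (the APInt::udivrem path in the first hunk), while a variable offset scaled by anything other than the element size is rejected instead of emitting extra index arithmetic.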
@@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
                                   ArrayTy->getNumElements());
 }
 
-static Value *stripBitcasts(Value *V) {
-  while (Instruction *I = dyn_cast<Instruction>(V)) {
-    if (I->getOpcode() != Instruction::BitCast)
-      break;
-    V = I->getOperand(0);
-  }
-  return V;
-}
-
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
   if (!GEP)
-    return nullptr;
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   auto I = GEPIdx.find(GEP);
-  return I == GEPIdx.end() ? nullptr : I->second;
+  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+  return I->second;
 }
 
-static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
-  // FIXME we only support simple cases
-  if (GEP->getNumOperands() != 3)
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                               Type *VecElemTy, const DataLayout &DL) {
+  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
+  // helper.
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  MapVector<Value *, APInt> VarOffsets;
+  APInt ConstOffset(BW, 0);
+  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
     return nullptr;
 
-  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
-  if (!I0 || !I0->isZero())
+  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
+  if (VarOffsets.size() > 1)
     return nullptr;
 
-  return GEP->getOperand(2);
-}
-
-// Not an instruction handled below to turn into a vector.
-//
-// TODO: Check isTriviallyVectorizable for calls and handle other
-// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User,
-                             const DataLayout &DL) {
-  switch (Inst->getOpcode()) {
-  case Instruction::Load: {
-    // Currently only handle the case where the Pointer Operand is a GEP.
-    // Also we could not vectorize volatile or atomic loads.
-    LoadInst *LI = cast<LoadInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        LI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(LI->getType()))
-      return true;
-
-    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
-    if (!PtrInst)
-      return false;
-
-    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
-            PtrInst->getOpcode() == Instruction::BitCast) &&
-           LI->isSimple();
+  if (VarOffsets.size() == 1) {
+    // Only handle cases where we don't need to insert extra arithmetic
+    // instructions.
+    const auto &VarOffset = VarOffsets.front();
+    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
+      return nullptr;
+    return VarOffset.first;
   }
-  case Instruction::BitCast:
-    return true;
-  case Instruction::Store: {
-    // Must be the stored pointer operand, not a stored value, plus
-    // since it should be canonical form, the User should be a GEP.
-    // Also we could not vectorize volatile or atomic stores.
-    StoreInst *SI = cast<StoreInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        SI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(SI->getValueOperand()->getType()))
-      return true;
-
-    Instruction *UserInst = dyn_cast<Instruction>(User);
-    if (!UserInst)
-      return false;
 
-    return (SI->getPointerOperand() == User) &&
-           (UserInst->getOpcode() == Instruction::GetElementPtr ||
-            UserInst->getOpcode() == Instruction::BitCast) &&
-           SI->isSimple();
-  }
-  default:
-    return false;
-  }
+  APInt Quot;
+  uint64_t Rem;
+  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
+  if (Rem != 0)
+    return nullptr;
+
+  return ConstantInt::get(GEP->getContext(), Quot);
 }
 
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
@@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  SmallVector<User *, 8> Users(Alloca->users());
-  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
-  Type *VecEltTy = VectorTy->getElementType();
-  while (!Users.empty()) {
-    User *AllocaUser = Users.pop_back_val();
-    User *UseUser = UseUsers.pop_back_val();
-    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+  SmallVector<Instruction *> WorkList;
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : Alloca->uses())
+    Uses.push_back(&U);
 
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(Inst, UseUser, DL))
+  Type *VecEltTy = VectorTy->getElementType();
+  while (!Uses.empty()) {
+    Use *U = Uses.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(U->getUser());
+
+    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
+      // This is a store of the pointer, not to the pointer.
+      if (isa<StoreInst>(Inst) &&
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return false;
 
-      if (Inst->getOpcode() == Instruction::BitCast) {
-        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
-        Type *ToTy = Inst->getType()->getPointerElementType();
-        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
-            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
-          continue;
-
-        for (User *CastUser : Inst->users()) {
-          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
-            continue;
-          Users.push_back(CastUser);
-          UseUsers.push_back(Inst);
-        }
+      Type *AccessTy = getLoadStoreType(Inst);
+      Ptr = Ptr->stripPointerCasts();
 
+      // Alloca already accessed as vector, leave alone.
+      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
+                               DL.getTypeStoreSize(AccessTy))
        continue;
-      }
 
-      WorkList.push_back(AllocaUser);
+      // Check that this is a simple access of a vector element.
+      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+                                          : cast<StoreInst>(Inst)->isSimple();
+      if (!IsSimple ||
+          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
+        return false;
+
+      WorkList.push_back(Inst);
      continue;
    }
 
-    Value *Index = GEPToVectorIndex(GEP);
-
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
-                        << '\n');
-      return false;
+    if (isa<BitCastInst>(Inst)) {
+      // Look through bitcasts.
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
    }
 
-    GEPVectorIdx[GEP] = Index;
-    Users.append(GEP->user_begin(), GEP->user_end());
-    UseUsers.append(GEP->getNumUses(), GEP);
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
+      if (!Index) {
+        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                          << '\n');
+        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
+    }
+
+    // Ignore assume-like intrinsics and comparisons used in assumes.
+    if (isAssumeLikeIntrinsic(Inst))
+      continue;
+
+    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
+          return isAssumeLikeIntrinsic(cast<Instruction>(U));
+        }))
+      continue;
+
+    // Unknown user.
+    return false;
  }
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  for (Value *V : WorkList) {
-    Instruction *Inst = cast<Instruction>(V);
+  for (Instruction *Inst : WorkList) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
-        break;
-
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy ||
-          SI->getValueOperand()->getType()->isVectorTy())
-        break;
-
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *Elt = SI->getValueOperand();

@@ -72,9 +72,15 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-NEXT:    [[FOO2:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
 ; CHECK-NEXT:    [[FOO4:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 0
-; CHECK-NEXT:    store float [[FOO3]], float* [[FOO4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 1
-; CHECK-NEXT:    store float 2.000000e+00, float* [[FOO5]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
+; CHECK-NEXT:    store <2 x float> [[TMP6]], <2 x float>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
 ; CHECK-NEXT:    store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
@@ -116,13 +122,15 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-NEXT:    store [2 x float] [[FOO3]], [2 x float]* [[F1]], align 4
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, i32* [[I]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], [2 x float]* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[FOO6:%.*]] = load float, float* [[FOO5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x float]* [[F1]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
 ; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16
 ; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, <4 x float>* [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT:    [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
 ; CHECK-NEXT:    store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
 ; CHECK-NEXT:    ret void
@@ -163,17 +171,28 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
 ; CHECK-NEXT:    store [2 x double] [[FOO5]], [2 x double]* [[S]], align 8
 ; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[FOO7:%.*]] = load double, double* [[FOO6]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
 ; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[FOO9:%.*]] = load double, double* [[FOO8]], align 8
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO7]], [[FOO9]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[FOO11:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT:    store double [[FOO10]], double* [[FOO11]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP7]], align 16
 ; CHECK-NEXT:    [[FOO12:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[FOO13:%.*]] = load double, double* [[FOO12]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 16
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
 ; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], [2 x double]* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[FOO15:%.*]] = load double, double* [[FOO14]], align 8
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO13]], [[FOO15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast [2 x double]* [[S]] to <2 x double>*
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1

@@ -281,14 +281,8 @@ bb13:                                             ; preds = %.preheader
 ; TODO: llvm.assume can be ingored
 
 ; OPT-LABEL: @vector_read_alloca_bitcast_assume(
-; OPT: %tmp = alloca <4 x i32>, align 16, addrspace(5)
-; OPT-NEXT: %x = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %tmp, i64 0, i64 0
-; OPT-NEXT: store i32 0, i32 addrspace(5)* %x, align 16
-; OPT-NEXT: %0 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %1 = shufflevector <4 x i32> %0, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; OPT-NEXT: store <4 x i32> %1, <4 x i32> addrspace(5)* %tmp, align 16
-; OPT-NEXT: %2 = extractelement <4 x i32> %1, i32 %index
-; OPT-NEXT: store i32 %2, i32 addrspace(1)* %out, align 4
+; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
 ; GCN-COUNT-4: buffer_store_dword
@@ -113,16 +113,9 @@ entry:
   ret void
 }
 
-; FIXME: Should be able to promote this. Instcombine should fold the
-; cast in the hasOneUse case so it might not matter in practice
-
 ; OPT-LABEL: @vector_read_bitcast_alloca(
-; OPT: alloca [4 x float]
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: store float
-; OPT: load float
+; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
+; OPT: store float %0, float addrspace(1)* %out, align 4
 define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32], addrspace(5)