[VectorCombine] allow peeking through GEPs when creating a vector load
This is an enhancement motivated by https://llvm.org/PR16739 (see D92858 for another).

We can look through a GEP to find a base pointer that may be safe to use for a vector load. If so, we shuffle (shift) the necessary vector element over to index 0.

Alive2 proof based on one of the regression tests: https://alive2.llvm.org/ce/z/yPJLkh

The vector translation is independent of endianness (verify by changing the leading 'e' in the datalayout string to 'E').

Differential Revision: https://reviews.llvm.org/D93229
This commit is contained in:
parent 0336ff0a17
commit 47aaa99c0e
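
A before/after sketch of the transform, adapted from the gep012_bitcast_load_i32_insert_v4i32 regression test below (the 12-byte offset lands on element 3 of the <4 x i32> load):

  ; Before: scalar load at a constant gep offset, inserted at index 0.
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0

  ; After: vector load from the base pointer, then shuffle element 3 down to index 0.
  %t1 = bitcast <16 x i8>* %p to <4 x i32>*
  %t2 = load <4 x i32>, <4 x i32>* %t1, align 1
  %r = shufflevector <4 x i32> %t2, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>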
llvm/lib/Transforms/Vectorize/VectorCombine.cpp

@@ -93,6 +93,7 @@ static void replaceValue(Value &Old, Value &New) {
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
   if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
@@ -115,7 +116,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
       mustSuppressSpeculation(*Load))
     return false;
 
-  // TODO: Extend this to match GEP with constant offsets.
   const DataLayout &DL = I.getModule()->getDataLayout();
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -127,10 +127,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (AS != SrcPtr->getType()->getPointerAddressSpace())
     SrcPtr = Load->getPointerOperand();
 
+  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+  // sure we have all of our type-based constraints in place for this target.
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
-  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+      ScalarSize % 8 != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
@@ -139,12 +142,45 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // we may use a larger value based on alignment attributes.
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
-    return false;
+  unsigned OffsetEltIndex = 0;
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+    // It is not safe to load directly from the pointer, but we can still peek
+    // through gep offsets and check if it is safe to load from a base address
+    // with updated alignment. If it is, we can shuffle the element(s) into
+    // place after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+    // We want to shuffle the result down from a high element of a vector, so
+    // the offset must be positive.
+    if (Offset.isNegative())
+      return false;
+
+    // The offset must be a multiple of the scalar element to shuffle cleanly
+    // in the element's size.
+    uint64_t ScalarSizeInBytes = ScalarSize / 8;
+    if (Offset.urem(ScalarSizeInBytes) != 0)
+      return false;
+
+    // If we load MinVecNumElts, will our target element still be loaded?
+    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
+    if (OffsetEltIndex >= MinVecNumElts)
+      return false;
+
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+      return false;
+
+    // Update alignment with offset value. Note that the offset could be negated
+    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
+    // negation does not change the result of the alignment calculation.
+    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
+  }
 
   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
   // Use the greater of the alignment on the load or its source pointer.
-  Align Alignment = std::max(SrcPtr->getPointerAlignment(DL), Load->getAlign());
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
   Type *LoadTy = Load->getType();
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
@@ -153,6 +189,9 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 
   // New pattern: load VecPtr
   int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetEltIndex)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -168,12 +207,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Set everything but element 0 to undef to prevent poison from propagating
   // from the extra loaded memory. This will also optionally shrink/grow the
   // vector from the loaded size to the output size.
-  // We assume this operation has no cost in codegen.
+  // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
   unsigned OutputNumElts = Ty->getNumElements();
   SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
-  Mask[0] = 0;
+  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+  Mask[0] = OffsetEltIndex;
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 
   replaceValue(I, *VecLd);
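
The new offset checks above, as a minimal standalone sketch (offsetToLane and commonAlign are hypothetical names, not LLVM API; commonAlign assumes the gcd reduction that commonAlignment performs for power-of-two alignments):

  #include <cstdint>
  #include <optional>

  // Which lane of a MinVecNumElts-wide vector load holds the scalar that was
  // originally loaded at a constant byte offset from the stripped base pointer?
  std::optional<uint64_t> offsetToLane(int64_t OffsetBytes,
                                       uint64_t ScalarSizeInBytes,
                                       uint64_t MinVecNumElts) {
    if (OffsetBytes < 0)
      return std::nullopt; // can only shuffle down from a higher lane
    if (uint64_t(OffsetBytes) % ScalarSizeInBytes != 0)
      return std::nullopt; // offset must land on an element boundary
    uint64_t Lane = uint64_t(OffsetBytes) / ScalarSizeInBytes;
    if (Lane >= MinVecNumElts)
      return std::nullopt; // target element would not be loaded at all
    return Lane;
  }

  // For power-of-two alignments, commonAlignment(A, Offset) is their gcd:
  // a load known 8-byte aligned at byte offset 2 is only 2-byte aligned.
  uint64_t commonAlign(uint64_t A, uint64_t OffsetBytes) {
    while (OffsetBytes != 0) {
      uint64_t T = A % OffsetBytes;
      A = OffsetBytes;
      OffsetBytes = T;
    }
    return A;
  }

In the tests below, offsetToLane(12, 4, 4) yields lane 3 (the <i32 3, ...> shuffle mask in gep012_bitcast_load_i32_insert_v4i32), and commonAlign(8, 2) == 2 matches the "align 2" on the AVX2 load in gep01_load_i16_insert_v8i16_deref_minalign.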
llvm/test/Transforms/VectorCombine/X86/load.ll

@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -269,14 +269,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
-; Negative test - can't safely load the offset vector, but could load+shuffle.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 2
@@ -284,14 +289,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
-; TODO: Verify that alignment of the new load is not over-specified.
+; Verify that alignment of the new load is not over-specified.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 8
@@ -299,6 +309,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2
   ret <8 x i16> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size.
+; TODO: Could bitcast around this limitation.
+
 define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
@@ -316,10 +330,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefer
 
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12
-; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
@@ -329,6 +342,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefe
   ret <4 x i32> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size and the offset must be low enough to fit in the vector
+; (bitcasting would not help this case).
+
 define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
@@ -608,15 +625,21 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16
   ret <8 x i32> %r
 }
 
-; TODO: Can't safely load the offset vector, but can load+shuffle if it is profitable.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
+; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
   %l = load <2 x i16>, <2 x i16>* %gep, align 8