[VectorCombine] try to create vector loads from scalar loads

This patch was adjusted to match the most basic pattern that starts with an insertelement
(so there's no extract created here). Hopefully, that removes any concern about
interfering with other passes. That is, the transform should almost always be profitable.

One could argue that this belongs in canonicalization, but we conservatively
avoid creating vector ops from scalar ops in passes like instcombine.

If the transform is not profitable, the backend should be able to re-scalarize the load.
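
For example (mirroring the load_f32_insert_v4f32 test below, where the pointer is
align 16 dereferenceable(16)):

  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0

becomes

  %0 = bitcast float* %p to <4 x float>*
  %r = load <4 x float>, <4 x float>* %0, align 4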

Differential Revision: https://reviews.llvm.org/D81766
Sanjay Patel 2020-08-09 08:59:54 -04:00
parent c70f0b9d4a
commit 43bdac2906
2 changed files with 95 additions and 17 deletions


@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@ using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@ private:
const TargetTransformInfo &TTI;
const DominatorTree &DT;
bool vectorizeLoadInsert(Instruction &I);
ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
ExtractElementInst *Ext1,
unsigned PreferredExtractIndex) const;
@@ -88,6 +91,61 @@ static void replaceValue(Value &Old, Value &New) {
New.takeName(&Old);
}
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert of scalar load.
Value *Scalar;
if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
return false;
auto *Load = dyn_cast<LoadInst>(Scalar);
Type *ScalarTy = Scalar->getType();
if (!Load || !Load->isSimple())
return false;
// TODO: Extend this to match GEP with constant offsets.
Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
if (!ScalarSize || VectorSize % ScalarSize != 0)
return false;
// Check safety of replacing the scalar load with a larger vector load.
unsigned VecNumElts = VectorSize / ScalarSize;
auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
// TODO: Allow insert/extract subvector if the type does not match.
if (VectorTy != I.getType())
return false;
Align Alignment = Load->getAlign();
const DataLayout &DL = I.getModule()->getDataLayout();
if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
return false;
// Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
Load->getPointerAddressSpace());
APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
// New pattern: load VecPtr
int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
Load->getPointerAddressSpace());
// We can aggressively convert to the vector form because the backend can
// invert this transform if it does not result in a performance win.
if (OldCost < NewCost)
return false;
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
replaceValue(I, *VecLd);
++NumVecLoad;
return true;
}
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +683,7 @@ bool VectorCombine::run() {
if (isa<DbgInfoIntrinsic>(I))
continue;
Builder.SetInsertPoint(&I);
MadeChange |= vectorizeLoadInsert(I);
MadeChange |= foldExtractExtract(I);
MadeChange |= foldBitcastShuf(I);
MadeChange |= scalarizeBinopOrCmp(I);


@@ -174,8 +174,8 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(
define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, float* %p, align 4
@@ -185,9 +185,7 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p
define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT: [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
; CHECK-NEXT: [[S:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b = bitcast <4 x float>* %p to float*
@@ -196,10 +194,12 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
ret <4 x float> %r
}
; Element type does not change cost.
define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%s = load i32, i32* %p, align 4
@@ -207,11 +207,12 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
ret <4 x i32> %r
}
; Pointer type does not change cost.
define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT: [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32*
; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b = bitcast <16 x i8>* %p to i32*
@@ -220,11 +221,11 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceabl
ret <4 x i32> %r
}
; This is canonical form for vector element access.
define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
; CHECK-NEXT: [[S:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
; CHECK-NEXT: ret <4 x float> [[R]]
;
%gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -233,11 +234,13 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenc
ret <4 x float> %r
}
; If there are enough dereferenceable bytes, we can offset the vector load.
define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -246,6 +249,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
ret <8 x i16> %r
}
; Negative test - can't safely load the offset vector, but could load+shuffle.
define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
@@ -259,11 +264,13 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
ret <8 x i16> %r
}
; If there are enough dereferenceable bytes, we can offset the vector load.
define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -272,6 +279,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
ret <8 x i16> %r
}
; Negative test - can't safely load the offset vector, but could load+shuffle.
define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
@@ -285,6 +294,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
ret <8 x i16> %r
}
; Negative test - do not alter volatile.
define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT: [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
@@ -296,6 +307,8 @@ define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceab
ret <4 x float> %r
}
; Negative test? - pointer is not as aligned as load.
define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -307,6 +320,8 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(1
ret <4 x float> %r
}
; Negative test - not enough bytes.
define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -318,6 +333,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
ret <4 x float> %r
}
; TODO: Should load v4i32.
define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
@@ -329,6 +346,8 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
ret <8 x i32> %r
}
; TODO: Should load v4i32.
define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*