[SLP] Fix vector element size for the store chains

The vector element size can differ between store chains. This patch
prevents the maximum number of vector elements from being computed
from the wrong chain's element size in that case.
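
To make the arithmetic concrete, here is a small standalone sketch. It is
illustrative only: the helper name maxLanes and the 128-bit register width are
assumptions for the example, not code or values taken from SLPVectorizer.cpp.

    // Hedged sketch, not LLVM code: the lane count for a store chain depends
    // on that chain's own element size, not on the first store overall.
    #include <cstdio>

    // Hypothetical helper mirroring MaxVecRegSize / EltSize from the patch.
    static unsigned maxLanes(unsigned MaxVecRegSizeBits, unsigned EltSizeBits) {
      if (MaxVecRegSizeBits % EltSizeBits != 0)
        return 1; // the patched loop skips such chains ("continue")
      return MaxVecRegSizeBits / EltSizeBits;
    }

    int main() {
      // Assuming a 128-bit vector register: an i64 chain gets 2 lanes, an i32
      // chain gets 4. Taking the element size from the first store in the whole
      // list (the old behavior) would wrongly cap the i32 chain at 2 lanes.
      printf("i64 chain: %u lanes\n", maxLanes(128, 64)); // 2
      printf("i32 chain: %u lanes\n", maxLanes(128, 32)); // 4
      return 0;
    }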

Differential Revision: https://reviews.llvm.org/D93192
Anton Afanasyev 2020-12-13 14:42:25 +03:00
parent 6c8ded0d8c
commit fac7c7ec3c
2 changed files with 15 additions and 22 deletions

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -6076,7 +6076,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
 // If a vector register can't hold 1 element, we are done.
 unsigned MaxVecRegSize = R.getMaxVecRegSize();
-unsigned EltSize = R.getVectorElementSize(Stores[0]);
+unsigned EltSize = R.getVectorElementSize(Operands[0]);
 if (MaxVecRegSize % EltSize != 0)
   continue;


@@ -23,28 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) {
 ; CHECK-NEXT: [[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9
 ; CHECK-NEXT: [[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10
 ; CHECK-NEXT: [[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11
-; CHECK-NEXT: [[T19:%.*]] = load i32, i32* [[T14]], align 4
-; CHECK-NEXT: [[T23:%.*]] = load i32, i32* [[T18]], align 4
-; CHECK-NEXT: [[T27:%.*]] = load i32, i32* [[T22]], align 4
-; CHECK-NEXT: [[T30:%.*]] = load i32, i32* [[T26]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT: [[T20:%.*]] = add nsw i32 [[T19]], 4
-; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], 4
-; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T27]], 6
-; CHECK-NEXT: [[T31:%.*]] = add nsw i32 [[T30]], 7
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], <i64 4, i64 4>
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], <i64 6, i64 7>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8
-; CHECK-NEXT: store i32 [[T20]], i32* [[T21]], align 4
-; CHECK-NEXT: store i32 [[T24]], i32* [[T25]], align 4
-; CHECK-NEXT: store i32 [[T28]], i32* [[T29]], align 4
-; CHECK-NEXT: store i32 [[T31]], i32* [[T32]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 4, i32 4, i32 6, i32 7>
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], <i64 4, i64 4>
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], <i64 6, i64 7>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT: ret void
 ;
 %t0 = bitcast i8* %v0 to i32*