[SVE][LoopVectorize] Optimise code generated by widenPHIInstruction

For SVE, when scalarising the PHI instruction the whole vector part is
generated, as opposed to creating instructions for each lane as is done
for fixed-width vectors. However, in some cases the lane values may be
needed later (e.g. for a load instruction), so we still need to calculate
these values to avoid an extractelement being called on the vector part.

Differential Revision: https://reviews.llvm.org/D109445
Rosie Sumpter 2021-09-08 13:42:33 +01:00
parent 788e7b3b8c
commit 9d1bea9c88
3 changed files with 32 additions and 19 deletions
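To make the index arithmetic in the first hunk below concrete, here is a small standalone C++ model. This is not LLVM code: the variable names merely mirror PartStartSplat, UnitStepVec and GlobalIndices from the patch, and the constants (vscale, the induction value, the part number) are arbitrary values chosen for illustration. It shows that the cached vector of indices and the per-lane scalar indices the patch now also emits compute the same values.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Model of one unrolled "part" of the vectorised loop. For SVE the real
  // vector length is only known at runtime; vscale = 2 is assumed here
  // purely for illustration.
  const uint64_t Vscale = 2;
  const uint64_t VF = 2 * Vscale; // <vscale x 2 x i64> has 2 * vscale lanes
  const uint64_t PtrInd = 40;     // pointer induction value for this iteration
  const uint64_t Part = 1;        // second unrolled part
  const uint64_t PartStart = Part * VF;

  // Vector path, mirroring PartStartSplat + UnitStepVec followed by the add
  // with the splat of the induction value in the patch; the whole vector is
  // cached so that any lane can still be extracted later.
  std::vector<uint64_t> GlobalIndices(VF);
  for (uint64_t Lane = 0; Lane < VF; ++Lane)
    GlobalIndices[Lane] = (PartStart + Lane) + PtrInd;

  // Scalar path: the per-lane values the patch now also materialises, so
  // that scalar users (e.g. the address of a load) can use a plain add/GEP
  // instead of an extractelement on the cached vector.
  for (uint64_t Lane = 0; Lane < VF; ++Lane) {
    const uint64_t SclrIndex = PtrInd + PartStart + Lane;
    assert(SclrIndex == GlobalIndices[Lane] && "scalar lane must match vector lane");
  }
  std::cout << "vector and scalar lane indices agree\n";
  return 0;
}
```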


@@ -4765,6 +4765,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
if (NeedsVectorIndex) {
// Here we cache the whole vector, which means we can support the
// extraction of any lane. However, in some cases the extractelement
// instruction that is generated for scalar uses of this vector (e.g.
// a load instruction) is not folded away. Therefore we still
// calculate values for the first n lanes to avoid redundant moves
// (when extracting the 0th element) and to produce scalar code (i.e.
// additional add/gep instructions instead of expensive extractelement
// instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
@@ -4772,9 +4780,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
// We've cached the whole vector, which means we can support the
// extraction of any lane.
continue;
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
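The behavioural change in the two hunks above is easiest to see as control flow: the `continue` that used to follow the vector caching is removed, so the per-lane loop now also runs when a vector index is needed. Below is a heavily reduced, hypothetical sketch of that flow; the helper functions are illustrative stand-ins, not LoopVectorize APIs.

```cpp
#include <cstdio>

// Hypothetical stand-ins for the per-part work done in
// InnerLoopVectorizer::widenPHIInstruction; purely illustrative.
static void cacheVectorOfPointers() { std::puts("cache whole vector of pointers"); }
static void emitScalarGEPForLane(unsigned Lane) {
  std::printf("emit scalar add/GEP for lane %u\n", Lane);
}

static void widenPointerInductionPart(bool NeedsVectorIndex, unsigned Lanes) {
  if (NeedsVectorIndex) {
    // Cache the whole vector so any lane can still be extracted if required.
    cacheVectorOfPointers();
    // Before this patch a `continue` here skipped the per-lane code below.
  }
  // After the patch the first Lanes lanes are always materialised as scalars,
  // so scalar users (e.g. a load address) need no extractelement.
  for (unsigned Lane = 0; Lane < Lanes; ++Lane)
    emitScalarGEPForLane(Lane);
}

int main() {
  widenPointerInductionPart(/*NeedsVectorIndex=*/true, /*Lanes=*/2);
  return 0;
}
```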


@@ -50,22 +50,25 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP9]], <vscale x 2 x i8*>* [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP15]], <vscale x 2 x i8>* [[TMP16]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]


@@ -131,6 +131,7 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
; CHECK-NEXT: %[[GEPA1:.*]] = getelementptr i32, i32* %a, i64 %[[IDX]]
; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
@@ -139,6 +140,10 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
; The following checks that there is no extractelement after
; vectorization when the stepvector has multiple uses, which demonstrates
; the removal of a redundant fmov instruction in the generated asm code.
; CHECK-NOT: %[[EXTRACT:.*]] = extractelement <vscale x 2 x i32*> [[APTRS1]], i32 0
; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()