[SVE][LoopVectorize] Optimise code generated by widenPHIInstruction
For SVE, when scalarising the PHI instruction the whole vector part is generated, as opposed to creating instructions for each lane as is done for fixed-width vectors. However, in some cases the lane values may be needed later (e.g. for a load instruction), so we still need to calculate these values to avoid extractelement being called on the vector part.

Differential Revision: https://reviews.llvm.org/D109445
parent 788e7b3b8c
commit 9d1bea9c88
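As a rough illustration of the effect on the generated code (a simplified sketch based on the test updates in the diff below; the value names, the i8 element type and the <vscale x 2 x ...> vector shapes are illustrative), a scalar user of the widened pointer PHI previously had to extract lane 0 from the vector of pointers, whereas it can now reuse cheaply recomputed scalar values:

; Before: a scalar user of the pointer PHI extracts a lane from the widened vector.
%next.gep = getelementptr i8, i8* %start, <vscale x 2 x i64> %indices
%lane0    = extractelement <vscale x 2 x i8*> %next.gep, i32 0
%val      = load i8, i8* %lane0, align 1

; After: the lane value is also emitted as scalar code, so no extractelement
; (and no redundant fmov in the final SVE assembly) is needed.
%next.gep  = getelementptr i8, i8* %start, <vscale x 2 x i64> %indices
%lane0.idx = add i64 %index, 0
%lane0.gep = getelementptr i8, i8* %start, i64 %lane0.idx
%val       = load i8, i8* %lane0.gep, align 1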
@@ -4765,6 +4765,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
if (NeedsVectorIndex) {
// Here we cache the whole vector, which means we can support the
// extraction of any lane. However, in some cases the extractelement
// instruction that is generated for scalar uses of this vector (e.g.
// a load instruction) is not folded away. Therefore we still
// calculate values for the first n lanes to avoid redundant moves
// (when extracting the 0th element) and to produce scalar code (i.e.
// additional add/gep instructions instead of expensive extractelement
// instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
@@ -4772,9 +4780,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
// We've cached the whole vector, which means we can support the
// extraction of any lane.
continue;
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
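Reassembled from the two hunks above, the scalarisation path in widenPHIInstruction now looks roughly as follows (an approximate sketch, not the verbatim source; the surrounding loop over Part and some declarations are omitted):

if (NeedsVectorIndex) {
  // Cache the whole vector part so that any lane can still be extracted.
  Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
  Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
  Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
  Value *SclrGep =
      emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
  SclrGep->setName("next.gep");
  State.set(PhiR, SclrGep, Part);
  // Note: no 'continue' here any more. Control falls through to the per-lane
  // loop below, so scalar add/gep values are still generated for each required
  // lane and scalar users do not need an extractelement.
}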
@@ -50,22 +50,25 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8** [[TMP10]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP9]], <vscale x 2 x i8*>* [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP15]], <vscale x 2 x i8>* [[TMP16]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -131,6 +131,7 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
; CHECK-NEXT: %[[GEPA1:.*]] = getelementptr i32, i32* %a, i64 %[[IDX]]
; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
@@ -139,6 +140,10 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
; The following checks that there is no extractelement after
; vectorization when the stepvector has multiple uses, which demonstrates
; the removal of a redundant fmov instruction in the generated asm code.
; CHECK-NOT: %[[EXTRACT:.*]] = extractelement <vscale x 2 x i32*> [[APTRS1]], i32 0
; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()