[LoopVectorizer] Don't perform interleaving of predicated scalar loops

The vectorizer will choose at times to "vectorize" loops with a scalar factor (VF=1) with interleaving (IC > 1). This can occasionally produce better code than the unroller (notable for reductions where it can produce independent reduction chains that are combined after the loop). At times this is not very beneficial though, for example when runtime checks are needed or when the scalar code requires predication. This addresses the second point, preventing the vectorizer from interleaving when the scalar loop will require predication. This prevents it from making a bit of a mess, that is worse than the original and better left for the unroller to unroll if beneficial. It helps reverse some of the regressions from D118090. Differential Revision: https://reviews.llvm.org/D118566
2022-02-07 19:34:28 +00:00 · 2022-02-07 19:34:28 +00:00 · b4c6d1bb37
parent d5a2944219
commit b4c6d1bb37
2 changed files with 19 additions and 91 deletions
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -6136,9 +6136,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
    return IC;
  }

-  // Note that if we've already vectorized the loop we will have done the
-  // runtime check and so interleaving won't require further checks.
-  bool InterleavingRequiresRuntimePointerCheck =
+  // For any scalar loop that either requires runtime checks or predication we
+  // are better off leaving this to the unroller. Note that if we've already
+  // vectorized the loop we will have done the runtime check and so interleaving
+  // won't require further checks.
+  bool ScalarInterleavingRequiresPredication =
+      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+         return Legal->blockNeedsPredication(BB);
+       }));
+  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
@ -6148,7 +6154,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
-  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
+      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll
@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S -o - < %s | FileCheck %s
+; RUN: opt -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -o - < %s | FileCheck %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-arm-none-eabi"
@ -47,90 +48,10 @@ define void @arm_correlate_f16(half* nocapture noundef readonly %pSrcA, i32 noun
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 1, [[IF_END12]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ]
 ; CHECK-NEXT:    [[I_077:%.*]] = phi i32 [ 0, [[IF_END12]] ], [ [[INC33:%.*]], [[FOR_END]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_176:%.*]] = phi half* [ [[PDST_ADDR_0]], [[IF_END12]] ], [ [[PDST_ADDR_2:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[INDVARS_IV]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[I_077]])
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[I_077]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], [[I_077]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[INDVARS_IV]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE9:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi half [ 0xH0000, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE9]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi half [ 0xH0000, [[VECTOR_PH]] ], [ [[PREDPHI10:%.*]], [[PRED_LOAD_CONTINUE9]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[I_077]], [[INDUCTION]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[I_077]], [[INDUCTION2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP5]], [[SRCBLEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP6]], [[SRCBLEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[INDUCTION]], [[SRCALEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i32 [[INDUCTION2]], [[SRCALEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[INDUCTION]] to i64
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi half [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[INDUCTION2]] to i64
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
-; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
-; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP21:%.*]] = sub nsw i32 0, [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load half, half* [[TMP23]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
-; CHECK:       pred.load.continue7:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE5]] ], [ [[TMP24]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9]]
-; CHECK:       pred.load.if8:
-; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw i32 0, [[TMP6]]
-; CHECK-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load half, half* [[TMP28]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; CHECK:       pred.load.continue9:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE7]] ], [ [[TMP29]], [[PRED_LOAD_IF8]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast half [[TMP25]], [[TMP16]]
-; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast half [[TMP30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP33:%.*]] = fadd fast half [[TMP31]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP34:%.*]] = fadd fast half [[TMP32]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP11]], true
-; CHECK-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP12]], true
-; CHECK-NEXT:    [[PREDPHI]] = select i1 [[TMP35]], half [[VEC_PHI]], half [[TMP33]]
-; CHECK-NEXT:    [[PREDPHI10]] = select i1 [[TMP36]], half [[VEC_PHI3]], half [[TMP34]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast half [[PREDPHI10]], [[PREDPHI]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[INDVARS_IV]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND14_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, [[VECTOR_SCEVCHECK]] ], [ 0xH0000, [[FOR_COND14_PREHEADER]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
 ; CHECK:       for.body16:
-; CHECK-NEXT:    [[J_074:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[SUM_073:%.*]] = phi half [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[J_074:%.*]] = phi i32 [ 0, [[FOR_COND14_PREHEADER]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[SUM_073:%.*]] = phi half [ 0xH0000, [[FOR_COND14_PREHEADER]] ], [ [[SUM_1:%.*]], [[FOR_INC]] ]
 ; CHECK-NEXT:    [[SUB17:%.*]] = sub i32 [[I_077]], [[J_074]]
 ; CHECK-NEXT:    [[CMP18:%.*]] = icmp ult i32 [[SUB17]], [[SRCBLEN_ADDR_0]]
 ; CHECK-NEXT:    [[CMP19:%.*]] = icmp ult i32 [[J_074]], [[SRCALEN_ADDR_0]]
@ -139,21 +60,21 @@ define void @arm_correlate_f16(half* nocapture noundef readonly %pSrcA, i32 noun
 ; CHECK:       if.then20:
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[J_074]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP38:%.*]] = load half, half* [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load half, half* [[ARRAYIDX]], align 2
 ; CHECK-NEXT:    [[SUB22:%.*]] = sub nsw i32 0, [[SUB17]]
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[SUB22]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load half, half* [[ARRAYIDX24]], align 2
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast half [[TMP39]], [[TMP38]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[ARRAYIDX24]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast half [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[ADD25:%.*]] = fadd fast half [[MUL]], [[SUM_073]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[SUM_1]] = phi half [ [[ADD25]], [[IF_THEN20]] ], [ [[SUM_073]], [[FOR_BODY16]] ]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_074]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY16]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY16]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi half [ [[SUM_1]], [[FOR_INC]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi half [ [[SUM_1]], [[FOR_INC]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_2]] = getelementptr inbounds half, half* [[PDST_ADDR_176]], i64 [[CMP27]]
 ; CHECK-NEXT:    store half [[SUM_1_LCSSA]], half* [[PDST_ADDR_176]], align 2
 ; CHECK-NEXT:    [[INC33]] = add nuw i32 [[I_077]], 1