diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 252eea660dc1..3abd1890a4ab 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2494,13 +2494,22 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,

   // Get the backedge taken count and truncate or extended to the AR type.
   Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
-  auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
-                                         Intrinsic::umul_with_overflow, Ty);

   // Compute |Step| * Backedge
-  CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
-  Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
-  Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+  Value *MulV, *OfMul;
+  if (Step->isOne()) {
+    // Special-case a Step of one: there can never be an overflow, so the
+    // potentially-costly `umul_with_overflow` isn't needed. Directly emit the
+    // optimized IR to avoid artificially inflating the cost of the check.
+    MulV = TruncTripCount;
+    OfMul = ConstantInt::getFalse(MulV->getContext());
+  } else {
+    auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+                                           Intrinsic::umul_with_overflow, Ty);
+    CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+    MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+    OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+  }

   // Compute:
   //   Start + |Step| * Backedge < Start
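To make the effect concrete, here is a hand-written before/after sketch of the overflow check the expander emits for a unit-step AddRec (the value names %start and %btc are invented for illustration; compare the vector.scevcheck blocks in the updated tests below). With a Step of one, |Step| * BackedgeTakenCount is just the backedge-taken count, so the multiply can never overflow and the whole umul_with_overflow pattern folds away:

  ; Before, with Step == 1 (%btc = backedge-taken count, %start = AddRec start):
  %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %btc)
  %mul.result = extractvalue { i32, i1 } %mul, 0
  %mul.overflow = extractvalue { i32, i1 } %mul, 1
  %end = add i32 %start, %mul.result
  %wrap = icmp slt i32 %end, %start
  %of = or i1 %wrap, %mul.overflow
  br i1 %of, label %scalar.ph, label %vector.ph

  ; After: only the add-wrap check remains.
  %end = add i32 %start, %btc
  %wrap = icmp slt i32 %end, %start
  br i1 %wrap, label %scalar.ph, label %vector.ph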
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 2b199450c46c..e12411c77a04 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -62,39 +62,35 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP0]]
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[ADD_US]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
+; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
 ; CHECK-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
 ; CHECK-NEXT: store i32 [[TMP22]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
 ; CHECK-NEXT: store i32 [[TMP23]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3
-; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index 4c07fce65e76..8cf46b49ed11 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -645,41 +645,37 @@ define void @sink_dominance(i32* %ptr, i32 %N) {
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0]]
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP10]], <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[TMP12]],
-; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP12]], <4 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP9]], <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[TMP11]],
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -736,43 +732,39 @@ define void @sink_dominance_2(i32* %ptr, i32 %N) {
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0]]
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP10]], <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]],
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP13]],
-; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[TMP12]],
-; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP12]], <4 x i32> [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP9]], <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP11]],
+; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i32> [[TMP12]],
+; CHECK-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[TMP11]],
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP11]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index 560a073cd146..3575c2129b59 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -24,17 +24,13 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[T1_0_LCSSA1]], -1
 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[ARR2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i8
-; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP5]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add i8 1, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 1, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i8 1, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 1, [[TMP5]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP7]], 1
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i8 [[TMP6]], 1
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP4]], 255
 ; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
@@ -45,18 +41,18 @@ define i8 @widget(i8* %arr, i8 %t9) {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i8> [[VEC_IND]],
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp slt <4 x i8> [[TMP13]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i8>
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP17]], <4 x i8>* [[TMP19]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[VEC_IND]],
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP16]], <4 x i8>* [[TMP18]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]],
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index 10b61cc9f3e7..f9e28b31baab 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -7,49 +7,31 @@ define void @test(float* %A, i32 %x) {
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[X:%.*]], 1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 undef)
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[IDENT_CHECK]], [[TMP4]]
-; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 undef)
-; CHECK-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[MUL_RESULT2]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 1, [[MUL_RESULT2]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i32 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW3]]
-; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[X]]
-; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[X]]
-; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP26]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[X]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP14]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index fef9b10ba156..e8b4c2983460 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -20,24 +20,20 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP10]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP10]]
 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP10]]
 ; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -49,20 +45,20 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = urem i32 [[TMP12]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP20]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = urem i32 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP15]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP19]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -113,24 +109,20 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP10]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP10]]
 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP10]]
 ; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -142,20 +134,20 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = urem i32 [[TMP12]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP20]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = urem i32 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP15]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP19]], align 4, !alias.scope !11
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -285,35 +277,31 @@ define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
 ; CHECK: vector.scevcheck:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = urem i32 [[TMP9]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP14]], <2 x i32>* [[TMP15]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[TMP14]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
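Note that only the multiply-related instructions drop out; the other SCEV predicate checks are unchanged. For instance, condensed from the load_clamped_index checks above (an i2-wide induction), the truncation and range checks survive even though the umul.with.overflow is gone:

  %1 = trunc i32 %0 to i2            ; %0 is N - 1
  %2 = add i2 0, %1                  ; Start + BTC computed in the narrow IV type
  %5 = icmp ult i2 %2, 0             ; generic wrap check on the narrow add (Start is 0 here)
  %6 = icmp ugt i32 %0, 3            ; N - 1 must also fit in i2
  %7 = or i1 %5, %6
  br i1 %7, label %scalar.ph, label %vector.memcheck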