forked from OSchip/llvm-project
[SLP]Improve vectorization of split loads.
Need to fix ther cost estimation for split loads, since we look at the subregs already, no need to permute them, need just to estimate subregister insert, if it is smaller than the real register. Also, using split loads, it might be profitable already to vectorize smaller trees with gathering of the loads. Differential Revision: https://reviews.llvm.org/D107188
This commit is contained in:
parent
59087dce3b
commit
352c46e707
|
@ -4519,6 +4519,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
|||
}
|
||||
if (!VectorizedLoads.empty()) {
|
||||
InstructionCost GatherCost = 0;
|
||||
unsigned NumParts = TTI->getNumberOfParts(VecTy);
|
||||
bool NeedInsertSubvectorAnalysis =
|
||||
!NumParts || (VL.size() / VF) > NumParts;
|
||||
// Get the cost for gathered loads.
|
||||
for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
|
||||
if (VectorizedLoads.contains(VL[I]))
|
||||
|
@ -4544,9 +4547,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
|||
TTI->getGatherScatterOpCost(
|
||||
Instruction::Load, LoadTy, LI->getPointerOperand(),
|
||||
/*VariableMask=*/false, Alignment, CostKind, LI);
|
||||
// Add the cost for the subvectors shuffling.
|
||||
GatherCost += ((VL.size() - VF) / VF) *
|
||||
TTI->getShuffleCost(TTI::SK_Select, VecTy);
|
||||
if (NeedInsertSubvectorAnalysis) {
|
||||
// Add the cost for the subvectors insert.
|
||||
for (int I = VF, E = VL.size(); I < E; I += VF)
|
||||
GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
|
||||
None, I, LoadTy);
|
||||
}
|
||||
return ReuseShuffleCost + GatherCost - ScalarsCost;
|
||||
}
|
||||
}
|
||||
|
@ -5011,7 +5017,9 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
|
|||
(allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
|
||||
TE->Scalars.size() < Limit ||
|
||||
(TE->getOpcode() == Instruction::ExtractElement &&
|
||||
isFixedVectorShuffle(TE->Scalars, Mask)));
|
||||
isFixedVectorShuffle(TE->Scalars, Mask)) ||
|
||||
(TE->State == TreeEntry::NeedToGather &&
|
||||
TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
|
||||
};
|
||||
|
||||
// We only handle trees of heights 1 and 2.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-7 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-18 -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
|
||||
|
||||
|
@ -24,11 +24,15 @@ target triple = "aarch64--linux-gnu"
|
|||
;
|
||||
|
||||
; YAML-LABEL: Function: getelementptr_4x32
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
; YAML-NEXT: Function: getelementptr_4x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '11'
|
||||
; YAML-NEXT: - Cost: '6'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '5'
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
|
@ -36,7 +40,7 @@ target triple = "aarch64--linux-gnu"
|
|||
; YAML-NEXT: Function: getelementptr_4x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '16'
|
||||
; YAML-NEXT: - Cost: '6'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
|
@ -46,48 +50,45 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
|
|||
; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
|
||||
; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
|
||||
; CHECK: for.body.preheader:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[X:%.*]], i32 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[Y:%.*]], i32 2
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
|
||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; CHECK: for.cond.cleanup.loopexit:
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP20:%.*]], i32 1
|
||||
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
|
||||
; CHECK: for.cond.cleanup:
|
||||
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
|
||||
; CHECK: for.body:
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
|
||||
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T4]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
|
||||
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP7]]
|
||||
; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
|
||||
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]]
|
||||
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP9]]
|
||||
; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
|
||||
; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
|
||||
; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
|
||||
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
|
||||
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD11]], i32 1
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> <i32 1, i32 poison>, i32 [[T12]], i32 1
|
||||
; CHECK-NEXT: [[TMP20]] = add nsw <2 x i32> [[TMP18]], [[TMP19]]
|
||||
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
|
||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP21]], [[N]]
|
||||
; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
|
||||
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
|
||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
|
||||
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
|
||||
;
|
||||
entry:
|
||||
|
@ -130,12 +131,6 @@ for.body:
|
|||
}
|
||||
|
||||
; YAML-LABEL: Function: getelementptr_2x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '11'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '5'
|
||||
|
||||
; YAML: --- !Passed
|
||||
; YAML-NEXT: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
|
@ -156,42 +151,38 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
|
|||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1
|
||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; CHECK: for.cond.cleanup.loopexit:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 1
|
||||
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
|
||||
; CHECK: for.cond.cleanup:
|
||||
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
|
||||
; CHECK: for.body:
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
|
||||
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP5]]
|
||||
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[T4]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP2]]
|
||||
; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
|
||||
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]]
|
||||
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
|
||||
; CHECK-NEXT: [[T7:%.*]] = or i32 [[T4]], 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[T7]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[T7]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP3]]
|
||||
; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
|
||||
; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP9]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP8]]
|
||||
; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
|
||||
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP10]]
|
||||
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[ADD11]], i32 1
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> <i32 1, i32 poison>, i32 [[T12]], i32 1
|
||||
; CHECK-NEXT: [[TMP17]] = add nsw <2 x i32> [[TMP15]], [[TMP16]]
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
|
||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP18]], [[N]]
|
||||
; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
|
||||
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
|
||||
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
|
||||
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
|
||||
;
|
||||
entry:
|
||||
|
|
|
@ -12,13 +12,11 @@ declare void @unknown()
|
|||
define void @test(float * %a, float * %b, float * %c, float * %d) {
|
||||
; CHECK-LABEL: @test(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[L0:%.*]] = load float, float* [[A:%.*]], align 4
|
||||
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
|
||||
; CHECK-NEXT: [[L1:%.*]] = load float, float* [[A1]], align 4
|
||||
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
|
||||
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
|
||||
; CHECK-NEXT: [[L2:%.*]] = load float, float* [[A2]], align 4
|
||||
; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
|
||||
; CHECK-NEXT: [[L3:%.*]] = load float, float* [[A3]], align 4
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
|
||||
; CHECK-NEXT: call void @unknown()
|
||||
; CHECK-NEXT: call void @unknown()
|
||||
; CHECK-NEXT: call void @unknown()
|
||||
|
@ -47,23 +45,21 @@ define void @test(float * %a, float * %b, float * %c, float * %d) {
|
|||
; CHECK-NEXT: call void @unknown()
|
||||
; CHECK-NEXT: call void @unknown()
|
||||
; CHECK-NEXT: call void @unknown()
|
||||
; CHECK-NEXT: store float [[L0]], float* [[B:%.*]], align 4
|
||||
; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1
|
||||
; CHECK-NEXT: store float [[L1]], float* [[B1]], align 4
|
||||
; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
|
||||
; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
|
||||
; CHECK-NEXT: store float [[L2]], float* [[B2]], align 4
|
||||
; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
|
||||
; CHECK-NEXT: store float [[L3]], float* [[B3]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
|
||||
; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1
|
||||
; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2
|
||||
; CHECK-NEXT: [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[C]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
|
||||
; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1
|
||||
; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2
|
||||
; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[D]] to <4 x float>*
|
||||
; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D]] to <4 x float>*
|
||||
; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
|
|
|
@ -131,18 +131,18 @@ define dso_local void @test_unordered_splits(%struct.S* nocapture %p) local_unna
|
|||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
|
||||
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
|
||||
; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
|
||||
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
|
||||
; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
|
||||
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX30]] to <4 x i32>*
|
||||
; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP5]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
|
@ -198,30 +198,31 @@ define dso_local void @test_cost_splits(%struct.S* nocapture %p) local_unnamed_a
|
|||
; CHECK-NEXT: [[G21:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P3]], i32 0, i64 13
|
||||
; CHECK-NEXT: [[G22:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 14
|
||||
; CHECK-NEXT: [[G23:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 15
|
||||
; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[G10]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0
|
||||
; CHECK-NEXT: store i32 [[I1]], i32* [[ARRAYIDX2]], align 4
|
||||
; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[G11]], align 4
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1
|
||||
; CHECK-NEXT: store i32 [[I3]], i32* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[G12]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2
|
||||
; CHECK-NEXT: store i32 [[I5]], i32* [[ARRAYIDX16]], align 4
|
||||
; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[G13]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>*
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
|
||||
; CHECK-NEXT: store i32 [[I7]], i32* [[ARRAYIDX23]], align 4
|
||||
; CHECK-NEXT: [[I9:%.*]] = load i32, i32* [[G20]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
|
||||
; CHECK-NEXT: store i32 [[I9]], i32* [[ARRAYIDX30]], align 4
|
||||
; CHECK-NEXT: [[I11:%.*]] = load i32, i32* [[G21]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>*
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
|
||||
; CHECK-NEXT: store i32 [[I11]], i32* [[ARRAYIDX37]], align 4
|
||||
; CHECK-NEXT: [[I13:%.*]] = load i32, i32* [[G22]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
|
||||
; CHECK-NEXT: store i32 [[I13]], i32* [[ARRAYIDX44]], align 4
|
||||
; CHECK-NEXT: [[I15:%.*]] = load i32, i32* [[G23]], align 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>*
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
|
||||
; CHECK-NEXT: store i32 [[I15]], i32* [[ARRAYIDX51]], align 4
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
|
|
|
@ -166,19 +166,21 @@ define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, fl
|
|||
; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
|
||||
; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4
|
||||
; CHECK-NEXT: store float [[TMP0]], float* [[DST_ADDR_022]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
|
||||
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
|
||||
; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
|
||||
; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>*
|
||||
; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
|
||||
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
|
||||
; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
|
||||
; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
|
||||
|
|
Loading…
Reference in New Issue