[PATCH] D105827: [SLP]Workaround for InsertSubVector cost.

The cost of the InsertSubvector shuffle kind cost is not complete and
may end up with just extracts + inserts costs in many cases. Added
a workaround to represent it as a generic PermuteSingleSrc, which is
still pessimistic but better than InsertSubvector.

Differential Revision: https://reviews.llvm.org/D105827
This commit is contained in:
Alexey Bataev 2021-07-16 12:42:19 -07:00
parent 304293d85d
commit 1b18e9ab67
14 changed files with 283 additions and 345 deletions

View File

@ -3885,14 +3885,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
/*Insert*/ true, /*Extract*/ false);
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
// FIXME: Replace with SK_InsertSubvector once it is properly supported.
unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
Cost += TTI->getShuffleCost(
TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
Offset,
FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
else if (!IsIdentity)
TargetTransformInfo::SK_PermuteSingleSrc,
FixedVectorType::get(SrcVecTy->getElementType(), Sz));
} else if (!IsIdentity) {
Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
ShuffleMask);
}
return Cost;
}

View File

@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16

View File

@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16
@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
;
entry:
%0 = load <4 x float>, <4 x float>* %a, align 16

View File

@ -9,25 +9,23 @@
define <8 x float> @ceil_floor(<8 x float> %a) {
; SSE-LABEL: @ceil_floor(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x float> [[R71]]
;
; SLM-LABEL: @ceil_floor(

View File

@ -9,25 +9,23 @@
define <8 x float> @ceil_floor(<8 x float> %a) {
; SSE-LABEL: @ceil_floor(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x float> [[R71]]
;
; SLM-LABEL: @ceil_floor(

View File

@ -232,33 +232,21 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: @ashr_lshr_shl_v8i32(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x i32> [[R71]]
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R72:%.*]] = shufflevector <8 x i32> [[R51]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x i32> [[R72]]
;
; SLM-LABEL: @ashr_lshr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

View File

@ -232,33 +232,21 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: @ashr_lshr_shl_v8i32(
; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[A]], [[B]]
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <2 x i32> <i32 6, i32 7>
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x i32> [[R71]]
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[R72:%.*]] = shufflevector <8 x i32> [[R51]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; SSE-NEXT: ret <8 x i32> [[R72]]
;
; SLM-LABEL: @ashr_lshr_shl_v8i32(
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
;
; Check that we can commute operands based on the predicate.
@ -235,48 +235,25 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
; SSE-LABEL: @fcmp_ord_uno_v4i32(
; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; SSE-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4
; SSE-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4
; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; SSE-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; SSE-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
; SSE-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
; SSE-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; SSE-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
; SSE-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
; SSE-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
; SSE-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; SSE-NEXT: ret <4 x i32> [[R]]
;
; AVX-LABEL: @fcmp_ord_uno_v4i32(
; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; AVX-NEXT: ret <4 x i32> [[R]]
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
;
; Check that we can commute operands based on the predicate.
@ -235,48 +235,25 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
; SSE-LABEL: @fcmp_ord_uno_v4i32(
; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; SSE-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4
; SSE-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4
; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; SSE-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; SSE-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
; SSE-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
; SSE-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; SSE-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
; SSE-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
; SSE-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
; SSE-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; SSE-NEXT: ret <4 x i32> [[R]]
;
; AVX-LABEL: @fcmp_ord_uno_v4i32(
; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; AVX-NEXT: ret <4 x i32> [[R]]
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1

View File

@ -729,19 +729,19 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_div(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i32 0
; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <4 x float*> [[TMP4]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP7]], <2 x i64> <i64 8, i64 5>
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP9]], i32 7
; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
@ -752,19 +752,19 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i32 0
; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <4 x float*> [[TMP4]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP7]], <2 x i64> <i64 8, i64 5>
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP9]], i32 7
; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>

View File

@ -729,19 +729,19 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
; AVX2-NEXT: ret void
;
; AVX512F-LABEL: @gather_load_div(
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i32 0
; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <4 x float*> [[TMP4]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP7]], <2 x i64> <i64 8, i64 5>
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP9]], i32 7
; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer
; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
@ -752,19 +752,19 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_div(
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i32 0
; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <4 x float*> [[TMP4]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <2 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP7]], <2 x i64> <i64 8, i64 5>
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP9]], i32 7
; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer
; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>

View File

@ -12,38 +12,38 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 3>
; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[CONV31_I]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CONV31_I]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CONV31_I]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV31_I]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = lshr <2 x i32> [[TMP7]], <i32 6, i32 7>
; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP7]], <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP7]], <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP4]], <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_I_I]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP15]], <16 x i32> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[SHR_4_I_I]], i32 5
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP15]], <16 x i32> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@ -56,12 +56,12 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP9]], i32 1
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i8> [[TMP22]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
; CHECK-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*

View File

@ -27,7 +27,6 @@ define void @test(i32* nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
@ -41,16 +40,22 @@ define void @test(i32* nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[T28]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4

View File

@ -27,7 +27,6 @@ define void @test(i32* nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
@ -41,16 +40,22 @@ define void @test(i32* nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4