forked from OSchip/llvm-project
Revert rG9ba577eca2e339726bfaad4e615c6324a705b292 "[X86][SSE] canonicalizeShuffleWithBinOps - handle target shuffles. NFCI."
Sorry this wasn't supposed to be committed yet (and certainly not tagged as NFCI....)
This commit is contained in:
parent
5fb43477dc
commit
75a184dacf
|
@ -36814,7 +36814,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
|
||||
// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)).
|
||||
static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
|
||||
const SDLoc &DL) {
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
|
@ -36822,14 +36822,11 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
|
|||
|
||||
auto IsMergeableWithShuffle = [](SDValue Op) {
|
||||
// AllZeros/AllOnes constants are freely shuffled and will peek through
|
||||
// bitcasts. Other constant build vectors do not peek through bitcasts. Only
|
||||
// merge with target shuffles if it has one use so shuffle combining is
|
||||
// likely to kick in.
|
||||
// bitcasts. Other constant build vectors do not peek through bitcasts.
|
||||
return ISD::isBuildVectorAllOnes(Op.getNode()) ||
|
||||
ISD::isBuildVectorAllZeros(Op.getNode()) ||
|
||||
ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
|
||||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
|
||||
(isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
|
||||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode());
|
||||
};
|
||||
auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
|
||||
// Ensure we only shuffle whole vector src elements, unless its a logical
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX1,AVX1-FAST
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
|
||||
|
||||
define float @pr26491(<4 x float> %a0) {
|
||||
|
@ -72,11 +72,11 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
|
|||
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; SSE2-NEXT: subpd {{.*}}(%rip), %xmm2
|
||||
; SSE2-NEXT: movapd %xmm2, %xmm3
|
||||
; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE2-NEXT: addpd %xmm3, %xmm2
|
||||
; SSE2-NEXT: divpd %xmm2, %xmm1
|
||||
; SSE2-NEXT: divpd %xmm2, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
|
||||
; SSE2-NEXT: addpd %xmm2, %xmm3
|
||||
; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0]
|
||||
; SSE2-NEXT: divpd %xmm3, %xmm1
|
||||
; SSE2-NEXT: divpd %xmm3, %xmm0
|
||||
; SSE2-NEXT: xorpd %xmm2, %xmm2
|
||||
; SSE2-NEXT: addpd %xmm2, %xmm0
|
||||
; SSE2-NEXT: addpd %xmm2, %xmm1
|
||||
|
@ -87,9 +87,10 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
|
|||
; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
|
||||
; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
|
||||
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm3 = xmm2[0,0]
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
|
||||
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
|
||||
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
|
||||
; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
|
||||
|
@ -110,17 +111,31 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
|
|||
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
|
||||
; SSSE3-FAST-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: PR41414:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovq %rdi, %xmm1
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
; AVX1-SLOW-LABEL: PR41414:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
||||
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
|
||||
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
|
||||
; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1-FAST-LABEL: PR41414:
|
||||
; AVX1-FAST: # %bb.0:
|
||||
; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
|
||||
; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
|
||||
; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
|
||||
; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: PR41414:
|
||||
; AVX2: # %bb.0:
|
||||
|
|
|
@ -364,10 +364,29 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
|
|||
; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
|
||||
; SSSE3_FAST-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: hadd_v4f64:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
|
||||
; AVX-NEXT: retq
|
||||
; AVX1_SLOW-LABEL: hadd_v4f64:
|
||||
; AVX1_SLOW: # %bb.0:
|
||||
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
|
||||
; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
|
||||
; AVX1_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1_FAST-LABEL: hadd_v4f64:
|
||||
; AVX1_FAST: # %bb.0:
|
||||
; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
|
||||
; AVX1_FAST-NEXT: retq
|
||||
;
|
||||
; AVX2_SLOW-LABEL: hadd_v4f64:
|
||||
; AVX2_SLOW: # %bb.0:
|
||||
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
|
||||
; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
|
||||
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
|
||||
; AVX2_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX2_FAST-LABEL: hadd_v4f64:
|
||||
; AVX2_FAST: # %bb.0:
|
||||
; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
|
||||
; AVX2_FAST-NEXT: retq
|
||||
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
|
||||
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
|
||||
%hop = fadd <4 x double> %a0, %a1
|
||||
|
@ -438,10 +457,29 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
|
|||
; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1
|
||||
; SSSE3_FAST-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: hsub_v4f64:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
|
||||
; AVX-NEXT: retq
|
||||
; AVX1_SLOW-LABEL: hsub_v4f64:
|
||||
; AVX1_SLOW: # %bb.0:
|
||||
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
|
||||
; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
|
||||
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
|
||||
; AVX1_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1_FAST-LABEL: hsub_v4f64:
|
||||
; AVX1_FAST: # %bb.0:
|
||||
; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
|
||||
; AVX1_FAST-NEXT: retq
|
||||
;
|
||||
; AVX2_SLOW-LABEL: hsub_v4f64:
|
||||
; AVX2_SLOW: # %bb.0:
|
||||
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
|
||||
; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
|
||||
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
|
||||
; AVX2_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX2_FAST-LABEL: hsub_v4f64:
|
||||
; AVX2_FAST: # %bb.0:
|
||||
; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
|
||||
; AVX2_FAST-NEXT: retq
|
||||
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
|
||||
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
|
||||
%hop = fsub <4 x double> %a0, %a1
|
||||
|
|
|
@ -470,8 +470,9 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
|
|||
; SSE-SLOW-LABEL: add_pd_010:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
|
||||
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: add_pd_010:
|
||||
|
@ -600,10 +601,10 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
|
|||
define <4 x float> @add_ps_017(<4 x float> %x) {
|
||||
; SSE-SLOW-LABEL: add_ps_017:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
|
||||
; SSE-SLOW-NEXT: addps %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
|
||||
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: add_ps_017:
|
||||
|
@ -925,10 +926,10 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
|
|||
define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
|
||||
; SSE-SLOW-LABEL: PR45747_1:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2,2,2]
|
||||
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: PR45747_1:
|
||||
|
@ -956,10 +957,9 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
|
|||
define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
|
||||
; SSE-SLOW-LABEL: PR45747_2:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: PR45747_2:
|
||||
|
@ -1009,14 +1009,14 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
|
|||
define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
|
||||
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
|
||||
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
|
||||
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm2, %xmm0
|
||||
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm3
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
|
||||
|
@ -1026,9 +1026,14 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
|
|||
;
|
||||
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
|
||||
; AVX-SLOW: # %bb.0:
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm2
|
||||
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
||||
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
|
||||
|
|
|
@ -19,20 +19,21 @@
|
|||
define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
|
||||
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm2[0,0]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
|
||||
|
@ -44,11 +45,17 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
;
|
||||
; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
|
||||
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
|
||||
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
|
@ -64,11 +71,17 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
;
|
||||
; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
|
@ -105,19 +118,21 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
|
||||
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
|
||||
|
@ -129,18 +144,21 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
|
|||
;
|
||||
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
|
||||
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
|
@ -153,20 +171,21 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
|
|||
;
|
||||
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
|
||||
; AVX2-SLOW-NEXT: retq
|
||||
%5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
|
||||
%6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
|
||||
|
@ -199,24 +218,27 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
|
|||
define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
|
||||
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm8
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
|
||||
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm8, %xmm1
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm8
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[3,1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm8
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm7
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,2]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
|
||||
|
@ -240,11 +262,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
;
|
||||
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
|
||||
; AVX1-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm1
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm2
|
||||
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3
|
||||
|
@ -290,11 +314,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
;
|
||||
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
|
||||
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
|
||||
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
|
||||
|
@ -385,25 +411,29 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
|
|||
define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
|
||||
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm8
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm8
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
|
||||
; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
|
||||
; SSSE3-SLOW-NEXT: movdqa %xmm8, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[2,0]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm8
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm7
|
||||
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
|
||||
|
@ -430,10 +460,13 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
|
|||
;
|
||||
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
|
||||
; AVX1-SLOW: # %bb.0:
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
|
||||
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm3
|
||||
|
@ -485,10 +518,13 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
|
|||
;
|
||||
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
|
||||
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
|
||||
|
@ -591,67 +627,77 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
|
|||
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
|
||||
; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2
|
||||
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3]
|
||||
; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm5, %xmm0
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
|
||||
; SSSE3-FAST: # %bb.0:
|
||||
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
|
||||
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
|
||||
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
|
||||
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
|
||||
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
|
||||
; SSSE3-FAST-NEXT: addps %xmm4, %xmm5
|
||||
; SSSE3-FAST-NEXT: addps %xmm5, %xmm1
|
||||
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0
|
||||
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0
|
||||
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
|
||||
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
|
||||
; SSSE3-FAST-NEXT: addps %xmm0, %xmm2
|
||||
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3]
|
||||
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3]
|
||||
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
|
||||
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
|
||||
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
|
||||
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
|
||||
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
|
||||
; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; SSSE3-FAST-NEXT: addps %xmm4, %xmm2
|
||||
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
|
||||
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
|
||||
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
|
||||
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
|
||||
; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
|
||||
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
|
||||
; SSSE3-FAST-NEXT: addps %xmm4, %xmm3
|
||||
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
|
||||
; SSSE3-FAST-NEXT: retq
|
||||
;
|
||||
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
|
||||
; AVX-SLOW: # %bb.0:
|
||||
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
|
||||
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
||||
; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm5, %xmm4
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
|
||||
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm1, %xmm1
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
|
||||
|
@ -664,15 +710,18 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
|
|||
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
|
||||
; AVX-FAST: # %bb.0:
|
||||
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
|
||||
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3]
|
||||
; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX-FAST-NEXT: vaddps %xmm4, %xmm5, %xmm4
|
||||
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
|
||||
; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
|
||||
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
|
||||
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
|
||||
; AVX-FAST-NEXT: vaddps %xmm1, %xmm4, %xmm1
|
||||
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
|
||||
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
|
||||
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
|
||||
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
|
@ -716,18 +765,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm4
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm2
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm4
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm3
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
|
@ -740,17 +789,17 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
|
||||
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1
|
||||
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
|
||||
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
|
||||
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
|
||||
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
|
||||
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm2
|
||||
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4
|
||||
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
|
||||
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
|
||||
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
|
||||
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3
|
||||
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
|
||||
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
|
||||
; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
|
||||
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSSE3-FAST-NEXT: retq
|
||||
;
|
||||
|
@ -762,20 +811,22 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
|
||||
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
|
||||
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
|
||||
; AVX1-SLOW-NEXT: retq
|
||||
;
|
||||
|
@ -787,18 +838,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0
|
||||
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
|
||||
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1
|
||||
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
|
||||
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
|
||||
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
|
||||
; AVX1-FAST-NEXT: retq
|
||||
;
|
||||
|
@ -810,20 +863,22 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX2-SLOW-NEXT: retq
|
||||
;
|
||||
|
@ -835,18 +890,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
|
||||
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0
|
||||
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
|
||||
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1
|
||||
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
|
||||
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
|
||||
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
|
||||
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX2-FAST-NEXT: retq
|
||||
%5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
|
||||
|
@ -1024,28 +1081,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
|
|||
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
|
||||
; SSSE3-SLOW: # %bb.0:
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1
|
||||
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
|
||||
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
|
||||
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
|
||||
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
|
||||
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
|
||||
|
@ -1072,19 +1129,23 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
|
|||
; AVX-SLOW: # %bb.0:
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
|
||||
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
|
||||
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
|
||||
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
|
||||
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2
|
||||
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
|
||||
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; AVX-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
|
||||
|
@ -1118,23 +1179,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
|
|||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
|
||||
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
|
||||
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
|
||||
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
|
||||
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
|
||||
; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0
|
||||
; SSSE3-SLOW-NEXT: retq
|
||||
;
|
||||
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
|
||||
|
@ -1157,22 +1217,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; AVX-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
@ -511,21 +511,39 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_smax:
; X86: # %bb.0:
; X86-NEXT: vpsrad $26, %xmm0, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT: vpsrad $26, %xmm1, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smax:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@ -551,21 +569,39 @@ declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_smin:
; X86: # %bb.0:
; X86-NEXT: vpsrad $26, %xmm0, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT: vpsrad $26, %xmm1, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smin:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@ -591,21 +627,39 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umax:
; X86: # %bb.0:
; X86-NEXT: vpsrad $26, %xmm0, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT: vpsrad $26, %xmm1, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umax:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@ -631,21 +685,39 @@ declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umin:
; X86: # %bb.0:
; X86-NEXT: vpsrad $26, %xmm0, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-NEXT: vpsrad $26, %xmm1, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umin:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@ -412,8 +412,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
@ -425,8 +425,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
@ -438,8 +438,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = add <4 x i32> %l, %x
@ -185,7 +185,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rsi
; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: movq %rsi, %xmm0
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: imulq %r8
; SSE2-NEXT: movq %rdx, %rax
@ -193,10 +193,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rdi
; SSE2-NEXT: movq %rdi, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rdx
@ -208,13 +208,14 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE2-NEXT: addq %rcx, %rax
; SSE2-NEXT: movq %rax, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
@ -853,13 +853,13 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpeqd %xmm2, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
@ -899,13 +899,13 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpeqd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
; SSE-NEXT: por %xmm4, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpeqd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@ -363,8 +363,8 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; AMD10H: # %bb.0:
; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AMD10H-NEXT: andps {{.*}}(%rip), %xmm0
; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AMD10H-NEXT: packuswb %xmm0, %xmm0
; AMD10H-NEXT: retq