[x86] allow peeking through an extract_subvector to find a splatted operand

The motivating case is seen in "splat4_v8f32_load_store" and based on code in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024
(I haven't stepped through the v8i32 sibling test yet to see why that diverged.)

There are other potential improvements visible like allowing scalarization or vector
narrowing.

Differential Revision: https://reviews.llvm.org/D74909
This commit is contained in:
Sanjay Patel 2020-02-20 13:41:56 -05:00
parent da181d4ba0
commit 064cd2ecdb
13 changed files with 173 additions and 234 deletions

View File

@ -12867,6 +12867,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
// TODO: Combine this logic with findEltLoadSrc() used by
// EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
@ -12882,6 +12884,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
BitOffset %= OpBitWidth;
continue;
}
case ISD::EXTRACT_SUBVECTOR: {
auto *ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
if (!ConstantIdx)
break;
// The extraction index adds to the existing offset.
unsigned EltBitWidth = V.getScalarValueSizeInBits();
unsigned Idx = ConstantIdx->getZExtValue();
unsigned BeginOffset = Idx * EltBitWidth;
BitOffset += BeginOffset;
V = V.getOperand(0);
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));

View File

@ -1965,7 +1965,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X64-LABEL: test_mm256_set1_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0

View File

@ -34,7 +34,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; X64-LABEL: funcC:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:

View File

@ -2161,11 +2161,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,3,4,2,0,3,4]
; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@ -2176,11 +2176,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@ -4351,17 +4350,13 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x doub
ret <2 x double> %res
}
; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,4,1,4,1,4,1,4]
; CHECK-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2
; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@ -4370,16 +4365,13 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double
ret <2 x double> %res
}
; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [1,4]
; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>

View File

@ -204,8 +204,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
;
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@ -431,8 +432,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2

View File

@ -260,8 +260,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
;
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@ -554,8 +555,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2

View File

@ -161,7 +161,7 @@ define float @t6(<8 x float> *%a0) {
;
; X64-AVX-LABEL: t6:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
@ -204,7 +204,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
;
; X64-AVX-LABEL: PR43971:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vpermilpd $1, 16(%rdi), %xmm0 # xmm0 = mem[1,0]
; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
@ -252,7 +252,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
;
; X64-AVX-LABEL: PR43971_1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero

View File

@ -1836,7 +1836,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: ## imm = 0x160
; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
@ -1846,7 +1847,7 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00]
@ -1856,10 +1857,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
; FMACALL32_BDVER2-NEXT: vmovsd 48(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x30]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@ -1869,10 +1871,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
; FMACALL32_BDVER2-NEXT: vmovsd 40(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x28]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@ -1882,10 +1885,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
; FMACALL32_BDVER2-NEXT: vmovsd 32(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x20]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
@ -1902,10 +1906,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
; FMACALL32_BDVER2-NEXT: vmovsd 24(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x18]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@ -1914,10 +1919,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
; FMACALL32_BDVER2-NEXT: vmovsd 16(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x10]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30]
; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@ -1927,10 +1933,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
; FMACALL32_BDVER2-NEXT: vmovsd 8(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x08]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20]
; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@ -1940,8 +1947,9 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload

View File

@ -376,7 +376,7 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
; AVX1-LABEL: arg_i64_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;

View File

@ -1244,57 +1244,77 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
;
; AVX1-LABEL: gather_v8i32_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: movl $c, %eax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT: movl $c, %ecx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: # implicit-def: $ymm1
; AVX1-NEXT: je .LBB4_2
; AVX1-NEXT: # %bb.1: # %cond.load
; AVX1-NEXT: vmovq %xmm4, %rcx
; AVX1-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovq %xmm3, %rdx
; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: .LBB4_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
; AVX1-NEXT: # %bb.3: # %cond.load1
; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm5
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vpextrq $1, %xmm3, %rdx
; AVX1-NEXT: vpinsrd $1, (%rdx), %xmm1, %xmm4
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: .LBB4_4: # %else2
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
; AVX1-NEXT: jne .LBB4_5
; AVX1-NEXT: # %bb.6: # %else5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: je .LBB4_6
; AVX1-NEXT: # %bb.5: # %cond.load4
; AVX1-NEXT: vmovq %xmm4, %rdx
; AVX1-NEXT: vpinsrd $2, (%rdx), %xmm1, %xmm5
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: .LBB4_6: # %else5
; AVX1-NEXT: vmovq %rcx, %xmm5
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: jne .LBB4_7
; AVX1-NEXT: je .LBB4_8
; AVX1-NEXT: # %bb.7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm6
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: .LBB4_8: # %else8
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: jne .LBB4_9
; AVX1-NEXT: je .LBB4_10
; AVX1-NEXT: # %bb.9: # %cond.load10
; AVX1-NEXT: vmovq %xmm3, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm6, %xmm6
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_10: # %else11
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: jne .LBB4_11
; AVX1-NEXT: je .LBB4_12
; AVX1-NEXT: # %bb.11: # %cond.load13
; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_12: # %else14
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_14
; AVX1-NEXT: .LBB4_13: # %cond.load16
; AVX1-NEXT: vmovq %xmm6, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: # %bb.13: # %cond.load16
; AVX1-NEXT: vmovq %xmm4, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_14: # %else17
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm5
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_16
; AVX1-NEXT: # %bb.15: # %cond.load19
; AVX1-NEXT: vpextrq $1, %xmm6, %rax
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@ -1455,33 +1475,6 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB4_5: # %cond.load4
; AVX1-NEXT: vmovq %xmm6, %rcx
; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm5
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_8
; AVX1-NEXT: .LBB4_7: # %cond.load7
; AVX1-NEXT: vpextrq $1, %xmm6, %rcx
; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm5
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_10
; AVX1-NEXT: .LBB4_9: # %cond.load10
; AVX1-NEXT: vmovq %xmm4, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_12
; AVX1-NEXT: .LBB4_11: # %cond.load13
; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: jne .LBB4_13
; AVX1-NEXT: jmp .LBB4_14
;
; AVX2-LABEL: gather_v8i32_v8i32:
; AVX2: # %bb.0:

View File

@ -17,116 +17,48 @@ define void @pr34653() {
; CHECK-NEXT: callq test
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm2
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3
; CHECK-NEXT: vmovaps %xmm3, %xmm4
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5
; CHECK-NEXT: vmovaps %xmm5, %xmm6
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7
; CHECK-NEXT: vmovaps %xmm7, %xmm8
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9
; CHECK-NEXT: vmovaps %xmm9, %xmm10
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11
; CHECK-NEXT: vmovaps %xmm11, %xmm12
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13
; CHECK-NEXT: vmovaps %xmm13, %xmm14
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15
; CHECK-NEXT: vmovaps %zmm15, %zmm16
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm17
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm18
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm19
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm20
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm21
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm22
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm23
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd {{.*#+}} xmm16 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm17 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm18 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm19 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm20 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm21 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm22 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm23 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload
; CHECK-NEXT: # xmm25 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload
; CHECK-NEXT: # xmm26 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 8-byte Reload
; CHECK-NEXT: # xmm27 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 8-byte Reload
; CHECK-NEXT: # xmm28 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 8-byte Reload
; CHECK-NEXT: # xmm29 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload
; CHECK-NEXT: # xmm30 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload
; CHECK-NEXT: # xmm31 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm25 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm26 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm27 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm28 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm29 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm30 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm31 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
@ -142,7 +74,6 @@ define void @pr34653() {
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%v = call fastcc <38 x double> @test()

View File

@ -65,7 +65,6 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15]
@ -74,7 +73,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15]
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]

View File

@ -1707,24 +1707,22 @@ define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) {
define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
; AVX1-LABEL: splat4_v8f32_load_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %xmm0
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
; AVX1-NEXT: vmovups %xmm5, (%rsi)
; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
; AVX1-NEXT: vmovups %xmm4, (%rsi)
; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat4_v8f32_load_store:
@ -1768,24 +1766,23 @@ define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
define void @splat4_v8i32_load_store(<8 x i32>* %s, <32 x i32>* %d) {
; AVX1-LABEL: splat4_v8i32_load_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %xmm0
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
; AVX1-NEXT: vmovups %xmm5, (%rsi)
; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1
; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2
; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3
; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
; AVX1-NEXT: vmovups %ymm1, 32(%rsi)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat4_v8i32_load_store: