From 064cd2ecdb3d0c52be5b6cf4fc67125baa714d3a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 20 Feb 2020 13:41:56 -0500 Subject: [PATCH] [x86] allow peeking through an extract_subvector to find a splatted operand The motivating case is seen in "splat4_v8f32_load_store" and based on code in PR42024: https://bugs.llvm.org/show_bug.cgi?id=42024 (I haven't stepped through the v8i32 sibling test yet to see why that diverged.) There are other potential improvements visible like allowing scalarization or vector narrowing. Differential Revision: https://reviews.llvm.org/D74909 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 15 +++ .../CodeGen/X86/avx-intrinsics-fast-isel.ll | 2 +- llvm/test/CodeGen/X86/avx-splat.ll | 2 +- .../X86/avx512-shuffles/partial_permute.ll | 28 ++-- .../X86/bitcast-int-to-vector-bool-sext.ll | 10 +- .../X86/bitcast-int-to-vector-bool-zext.ll | 10 +- llvm/test/CodeGen/X86/extractelement-load.ll | 6 +- llvm/test/CodeGen/X86/fma.ll | 40 +++--- .../CodeGen/X86/insertelement-var-index.ll | 2 +- llvm/test/CodeGen/X86/masked_gather.ll | 97 +++++++------- llvm/test/CodeGen/X86/pr34653.ll | 123 ++++-------------- .../CodeGen/X86/vector-shuffle-512-v32.ll | 3 +- .../CodeGen/X86/x86-interleaved-access.ll | 69 +++++----- 13 files changed, 173 insertions(+), 234 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 981d27969a95..8dfb94b1328f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12867,6 +12867,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + // TODO: Combine this logic with findEltLoadSrc() used by + // EltsFromConsecutiveLoads(). int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { @@ -12882,6 +12884,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, BitOffset %= OpBitWidth; continue; } + case ISD::EXTRACT_SUBVECTOR: { + auto *ConstantIdx = dyn_cast(V.getOperand(1)); + if (!ConstantIdx) + break; + + // The extraction index adds to the existing offset. + unsigned EltBitWidth = V.getScalarValueSizeInBits(); + unsigned Idx = ConstantIdx->getZExtValue(); + unsigned BeginOffset = Idx * EltBitWidth; + BitOffset += BeginOffset; + V = V.getOperand(0); + continue; + } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 1e6d0e8766b3..746b068cec2f 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1965,7 +1965,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind { ; X64-LABEL: test_mm256_set1_epi64x: ; X64: # %bb.0: ; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll index 3751fccb09d4..26fefb26fff4 100644 --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -34,7 +34,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { ; X64-LABEL: funcC: ; X64: # %bb.0: # %entry ; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 198694a30143..8bb063a8738d 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -2161,11 +2161,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] -; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,3,4,2,0,3,4] +; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2176,11 +2176,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -4351,17 +4350,13 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x doub ret <2 x double> %res } -; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD. define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,4,1,4,1,4,1,4] -; CHECK-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2 +; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4370,16 +4365,13 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double ret <2 x double> %res } -; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD. define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [1,4] +; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index aa71d3749cae..0608d1809ed6 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -204,8 +204,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; ; AVX1-LABEL: ext_i4_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -431,8 +432,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; ; AVX1-LABEL: ext_i8_8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index af1abe71e2f2..9eec82cd6652 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -260,8 +260,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; ; AVX1-LABEL: ext_i4_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -554,8 +555,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; ; AVX1-LABEL: ext_i8_8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 6b5dd7ee3f10..332fea81adff 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -161,7 +161,7 @@ define float @t6(<8 x float> *%a0) { ; ; X64-AVX-LABEL: t6: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3] +; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 ; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero @@ -204,7 +204,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) { ; ; X64-AVX-LABEL: PR43971: ; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vpermilpd $1, 16(%rdi), %xmm0 # xmm0 = mem[1,0] +; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 ; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero @@ -252,7 +252,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind { ; ; X64-AVX-LABEL: PR43971_1: ; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3] +; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 ; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 78ee863d19f9..a687bfd43fa6 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -1836,7 +1836,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: ## imm = 0x160 ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] +; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] @@ -1846,7 +1847,7 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00] @@ -1856,10 +1857,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovsd 48(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1869,10 +1871,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovsd 40(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x28] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1882,10 +1885,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovsd 32(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] @@ -1902,10 +1906,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovsd 24(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x18] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1914,10 +1919,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovsd 16(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1927,10 +1933,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovsd 8(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x08] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1940,8 +1947,9 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll index a37fe63944de..564c789c9880 100644 --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -376,7 +376,7 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind { ; AVX1-LABEL: arg_i64_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq %rdi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll index 138b33d55b6a..44dcf90c07b5 100644 --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -1244,57 +1244,77 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; ; AVX1-LABEL: gather_v8i32_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $c, %eax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4 +; AVX1-NEXT: movl $c, %ecx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: # implicit-def: $ymm1 ; AVX1-NEXT: je .LBB4_2 ; AVX1-NEXT: # %bb.1: # %cond.load -; AVX1-NEXT: vmovq %xmm4, %rcx -; AVX1-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovq %xmm3, %rdx +; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: .LBB4_2: # %else ; AVX1-NEXT: testb $2, %al ; AVX1-NEXT: je .LBB4_4 ; AVX1-NEXT: # %bb.3: # %cond.load1 -; AVX1-NEXT: vpextrq $1, %xmm4, %rcx -; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpextrq $1, %xmm3, %rdx +; AVX1-NEXT: vpinsrd $1, (%rdx), %xmm1, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: .LBB4_4: # %else2 ; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: jne .LBB4_5 -; AVX1-NEXT: # %bb.6: # %else5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: je .LBB4_6 +; AVX1-NEXT: # %bb.5: # %cond.load4 +; AVX1-NEXT: vmovq %xmm4, %rdx +; AVX1-NEXT: vpinsrd $2, (%rdx), %xmm1, %xmm5 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: .LBB4_6: # %else5 +; AVX1-NEXT: vmovq %rcx, %xmm5 ; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: jne .LBB4_7 +; AVX1-NEXT: je .LBB4_8 +; AVX1-NEXT: # %bb.7: # %cond.load7 +; AVX1-NEXT: vpextrq $1, %xmm4, %rcx +; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: .LBB4_8: # %else8 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] ; AVX1-NEXT: testb $16, %al -; AVX1-NEXT: jne .LBB4_9 +; AVX1-NEXT: je .LBB4_10 +; AVX1-NEXT: # %bb.9: # %cond.load10 +; AVX1-NEXT: vmovq %xmm3, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-NEXT: .LBB4_10: # %else11 ; AVX1-NEXT: testb $32, %al -; AVX1-NEXT: jne .LBB4_11 +; AVX1-NEXT: je .LBB4_12 +; AVX1-NEXT: # %bb.11: # %cond.load13 +; AVX1-NEXT: vpextrq $1, %xmm3, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: .LBB4_12: # %else14 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_14 -; AVX1-NEXT: .LBB4_13: # %cond.load16 -; AVX1-NEXT: vmovq %xmm6, %rcx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: # %bb.13: # %cond.load16 +; AVX1-NEXT: vmovq %xmm4, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: .LBB4_14: # %else17 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm5 ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_16 ; AVX1-NEXT: # %bb.15: # %cond.load19 -; AVX1-NEXT: vpextrq $1, %xmm6, %rax +; AVX1-NEXT: vpextrq $1, %xmm4, %rax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 @@ -1455,33 +1475,6 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq -; AVX1-NEXT: .LBB4_5: # %cond.load4 -; AVX1-NEXT: vmovq %xmm6, %rcx -; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: je .LBB4_8 -; AVX1-NEXT: .LBB4_7: # %cond.load7 -; AVX1-NEXT: vpextrq $1, %xmm6, %rcx -; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm5 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: testb $16, %al -; AVX1-NEXT: je .LBB4_10 -; AVX1-NEXT: .LBB4_9: # %cond.load10 -; AVX1-NEXT: vmovq %xmm4, %rcx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-NEXT: testb $32, %al -; AVX1-NEXT: je .LBB4_12 -; AVX1-NEXT: .LBB4_11: # %cond.load13 -; AVX1-NEXT: vpextrq $1, %xmm4, %rcx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: testb $64, %al -; AVX1-NEXT: jne .LBB4_13 -; AVX1-NEXT: jmp .LBB4_14 ; ; AVX2-LABEL: gather_v8i32_v8i32: ; AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pr34653.ll b/llvm/test/CodeGen/X86/pr34653.ll index 72843a5052be..9a0b56a90cb7 100644 --- a/llvm/test/CodeGen/X86/pr34653.ll +++ b/llvm/test/CodeGen/X86/pr34653.ll @@ -17,116 +17,48 @@ define void @pr34653() { ; CHECK-NEXT: callq test ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1 -; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3 -; CHECK-NEXT: vmovaps %xmm3, %xmm4 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5 -; CHECK-NEXT: vmovaps %xmm5, %xmm6 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7 -; CHECK-NEXT: vmovaps %xmm7, %xmm8 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9 -; CHECK-NEXT: vmovaps %xmm9, %xmm10 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11 -; CHECK-NEXT: vmovaps %xmm11, %xmm12 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13 -; CHECK-NEXT: vmovaps %xmm13, %xmm14 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15 -; CHECK-NEXT: vmovaps %zmm15, %zmm16 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm17 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm18 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm19 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm20 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm21 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm22 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm23 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm16 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm17 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm18 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm19 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm20 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm21 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm22 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm23 = mem[0],zero ; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload -; CHECK-NEXT: # xmm25 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload -; CHECK-NEXT: # xmm26 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 8-byte Reload -; CHECK-NEXT: # xmm27 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 8-byte Reload -; CHECK-NEXT: # xmm28 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 8-byte Reload -; CHECK-NEXT: # xmm29 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload -; CHECK-NEXT: # xmm30 = mem[0],zero -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload -; CHECK-NEXT: # xmm31 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm25 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm26 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm27 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm28 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm29 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm30 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm31 = mem[0],zero ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero @@ -142,7 +74,6 @@ define void @pr34653() { ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %v = call fastcc <38 x double> @test() diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index c59d31911ad5..54266b12864f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -65,7 +65,6 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15] @@ -74,7 +73,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15] ; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u] ; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 ; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index fc81e8658538..58cef0725c0d 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1707,24 +1707,22 @@ define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) { define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) { ; AVX1-LABEL: splat4_v8f32_load_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups (%rdi), %xmm0 -; AVX1-NEXT: vmovups 16(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vmovups %xmm0, 48(%rsi) -; AVX1-NEXT: vmovups %xmm7, 32(%rsi) -; AVX1-NEXT: vmovups %xmm6, 16(%rsi) -; AVX1-NEXT: vmovups %xmm5, (%rsi) -; AVX1-NEXT: vmovups %xmm1, 112(%rsi) -; AVX1-NEXT: vmovups %xmm4, 96(%rsi) -; AVX1-NEXT: vmovups %xmm3, 80(%rsi) -; AVX1-NEXT: vmovups %xmm2, 64(%rsi) +; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0 +; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2 +; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3 +; AVX1-NEXT: vbroadcastss (%rdi), %xmm4 +; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5 +; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6 +; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7 +; AVX1-NEXT: vmovups %xmm7, 48(%rsi) +; AVX1-NEXT: vmovups %xmm6, 32(%rsi) +; AVX1-NEXT: vmovups %xmm5, 16(%rsi) +; AVX1-NEXT: vmovups %xmm4, (%rsi) +; AVX1-NEXT: vmovups %xmm3, 112(%rsi) +; AVX1-NEXT: vmovups %xmm2, 96(%rsi) +; AVX1-NEXT: vmovups %xmm1, 80(%rsi) +; AVX1-NEXT: vmovups %xmm0, 64(%rsi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat4_v8f32_load_store: @@ -1768,24 +1766,23 @@ define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) { define void @splat4_v8i32_load_store(<8 x i32>* %s, <32 x i32>* %d) { ; AVX1-LABEL: splat4_v8i32_load_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups (%rdi), %xmm0 -; AVX1-NEXT: vmovups 16(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vmovups %xmm0, 48(%rsi) -; AVX1-NEXT: vmovups %xmm7, 32(%rsi) -; AVX1-NEXT: vmovups %xmm6, 16(%rsi) -; AVX1-NEXT: vmovups %xmm5, (%rsi) -; AVX1-NEXT: vmovups %xmm1, 112(%rsi) -; AVX1-NEXT: vmovups %xmm4, 96(%rsi) -; AVX1-NEXT: vmovups %xmm3, 80(%rsi) -; AVX1-NEXT: vmovups %xmm2, 64(%rsi) +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2 +; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3 +; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vmovups %ymm3, 96(%rsi) +; AVX1-NEXT: vmovups %ymm2, 64(%rsi) +; AVX1-NEXT: vmovups %ymm1, 32(%rsi) +; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat4_v8i32_load_store: