From 4bc6f63320289e280fd848d163ada995f5fe679b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 28 Feb 2020 15:18:08 +0000 Subject: [PATCH] [TargetLowering] SimplifyDemandedBits - fix SCALAR_TO_VECTOR knownbits bug We can only report the knownbits for a SCALAR_TO_VECTOR node if we only demand the 0'th element - the upper elements are undefined and shouldn't be trusted. This is causing a number of regressions that need addressing but we need to get the bugfix in first. --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 6 +- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 24 ++-- llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll | 96 +++++++------ llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll | 66 +++++---- .../X86/broadcast-elm-cross-splat-vec.ll | 14 +- ...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 60 ++++---- llvm/test/CodeGen/X86/insertelement-ones.ll | 79 ++++++---- llvm/test/CodeGen/X86/load-partial.ll | 16 ++- llvm/test/CodeGen/X86/pr30562.ll | 12 +- llvm/test/CodeGen/X86/sse3.ll | 6 +- llvm/test/CodeGen/X86/vector-mul.ll | 136 +++++++++++------- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 3 +- llvm/test/CodeGen/X86/vector-trunc-math.ll | 15 +- 13 files changed, 321 insertions(+), 212 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6dc99e761656..9f2e453907c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -875,7 +875,11 @@ bool TargetLowering::SimplifyDemandedBits( APInt SrcDemandedBits = DemandedBits.zextOrSelf(SrcBitWidth); if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1)) return true; - Known = SrcKnown.anyextOrTrunc(BitWidth); + + // Upper elements are undef, so only get the knownbits if we just demand + // the bottom element. 
+ if (DemandedElts == 1) + Known = SrcKnown.anyextOrTrunc(BitWidth); break; } case ISD::BUILD_VECTOR: diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index 60df77fc0970..a77ad21105df 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -356,22 +356,24 @@ define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signe ; CHECK-LABEL: test16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sldi r4, r4, 1 -; CHECK-NEXT: lxsihzx v2, r3, r4 -; CHECK-NEXT: vsplth v2, v2, 3 -; CHECK-NEXT: xxlxor v3, v3, v3 -; CHECK-NEXT: vmrglh v2, v3, v2 -; CHECK-NEXT: vsplth v4, v3, 7 ; CHECK-NEXT: add r6, r3, r4 -; CHECK-NEXT: li r3, 16 -; CHECK-NEXT: vmrglw v2, v2, v4 -; CHECK-NEXT: lxsihzx v4, r6, r3 +; CHECK-NEXT: li r7, 16 +; CHECK-NEXT: lxsihzx v2, r6, r7 +; CHECK-NEXT: lxsihzx v4, r3, r4 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: mtvsrd f0, r6 +; CHECK-NEXT: vsplth v4, v4, 3 +; CHECK-NEXT: xxswapd v3, vs0 +; CHECK-NEXT: vsplth v2, v2, 3 ; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha ; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-NEXT: vsplth v4, v4, 3 -; CHECK-NEXT: vmrglh v3, v3, v4 +; CHECK-NEXT: vmrglh v4, v3, v4 +; CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: vsplth v3, v3, 7 +; CHECK-NEXT: vmrglw v3, v4, v3 ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: vperm v2, v3, v2, v4 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: xxspltw v3, v2, 2 ; CHECK-NEXT: vadduwm v2, v2, v3 ; CHECK-NEXT: vextuwrx r3, r3, v2 diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll index d795f6b62fab..e40c348fcb87 100644 --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -1071,7 +1071,6 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-NEXT: extsw r4, r4 ; P9LE-NEXT: mulld r5, r4, r5 ; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: add r4, r5, r4 ; P9LE-NEXT: srwi r5, r4, 31 ; P9LE-NEXT: srawi r4, r4, 9 @@ -1080,6 +1079,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-NEXT: mulli r4, r4, 654 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r4, r3 @@ -1094,7 +1096,7 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-NEXT: lis r5, 24749 ; P9LE-NEXT: mulli r4, r4, 23 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -1179,7 +1181,6 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P8LE-NEXT: lis r3, 24749 ; P8LE-NEXT: lis r8, -19946 ; P8LE-NEXT: lis r10, -14230 -; P8LE-NEXT: xxlxor v5, v5, v5 ; P8LE-NEXT: ori r3, r3, 47143 ; P8LE-NEXT: ori r8, r8, 17097 ; P8LE-NEXT: mfvsrd r4, f0 @@ -1212,18 +1213,21 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P8LE-NEXT: mulli r3, r3, 5423 ; P8LE-NEXT: add r7, r7, r9 ; P8LE-NEXT: mulli r8, r8, 23 +; P8LE-NEXT: li r9, 0 ; P8LE-NEXT: mulli r7, r7, 654 +; P8LE-NEXT: mtvsrd f0, r9 ; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r8, r6 -; P8LE-NEXT: subf r4, r7, r4 +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: subf r5, r8, r6 ; P8LE-NEXT: mtvsrd f1, r3 -; P8LE-NEXT: mtvsrd f2, r4 -; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: 
xxswapd v4, vs2 +; P8LE-NEXT: subf r3, r7, r4 +; P8LE-NEXT: mtvsrd f2, r5 +; P8LE-NEXT: mtvsrd f3, r3 +; P8LE-NEXT: xxswapd v2, vs1 +; P8LE-NEXT: xxswapd v3, vs2 +; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v2, v3 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; @@ -1328,9 +1332,11 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr @@ -1388,47 +1394,49 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; P8LE-LABEL: dont_fold_urem_i16_smax: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r6, 24749 -; P8LE-NEXT: lis r7, -19946 -; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: ori r6, r6, 47143 -; P8LE-NEXT: ori r7, r7, 17097 -; P8LE-NEXT: mfvsrd r3, f0 -; P8LE-NEXT: rldicl r4, r3, 16, 48 -; P8LE-NEXT: rldicl r5, r3, 32, 48 -; P8LE-NEXT: extsh r8, r4 -; P8LE-NEXT: extsh r9, r5 -; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: lis r3, 24749 +; P8LE-NEXT: lis r8, -19946 +; P8LE-NEXT: ori r3, r3, 47143 +; P8LE-NEXT: ori r8, r8, 17097 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: rldicl r6, r4, 32, 48 +; P8LE-NEXT: extsh r7, r5 +; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: extsw r7, r7 ; P8LE-NEXT: extsw r9, r9 -; P8LE-NEXT: mulld r6, r8, r6 -; P8LE-NEXT: mulld r7, r9, r7 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: rldicl r8, r6, 32, 32 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: mulld r7, r9, r8 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: rldicl r8, r3, 1, 63 +; P8LE-NEXT: rldicl r3, r3, 32, 32 ; P8LE-NEXT: rldicl r7, r7, 32, 32 -; P8LE-NEXT: rldicl r6, r6, 1, 63 -; P8LE-NEXT: srawi r8, r8, 11 +; P8LE-NEXT: srawi r3, r3, 11 ; P8LE-NEXT: add r7, r7, r9 -; P8LE-NEXT: add r6, r8, r6 +; P8LE-NEXT: add r3, r3, r8 +; P8LE-NEXT: li r9, 0 ; P8LE-NEXT: srwi r8, r7, 31 ; P8LE-NEXT: srawi r7, r7, 4 -; P8LE-NEXT: mulli r6, r6, 5423 +; P8LE-NEXT: mtvsrd f0, r9 +; P8LE-NEXT: mulli r3, r3, 5423 ; P8LE-NEXT: add r7, r7, r8 -; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: extsh r8, r4 ; P8LE-NEXT: mulli r7, r7, 23 ; P8LE-NEXT: srawi r8, r8, 15 -; P8LE-NEXT: subf r4, r6, r4 -; P8LE-NEXT: addze r6, r8 -; P8LE-NEXT: mtvsrd f0, r4 -; P8LE-NEXT: slwi r4, r6, 15 -; P8LE-NEXT: subf r5, r7, r5 -; P8LE-NEXT: subf r3, r4, r3 -; P8LE-NEXT: mtvsrd f1, r5 -; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f2, r3 -; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v4, vs0 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: addze r5, r8 +; P8LE-NEXT: slwi r5, r5, 15 +; P8LE-NEXT: subf r6, r7, r6 +; P8LE-NEXT: mtvsrd f1, r3 +; P8LE-NEXT: subf r3, r5, r4 +; P8LE-NEXT: mtvsrd f2, r6 +; P8LE-NEXT: mtvsrd f3, r3 +; P8LE-NEXT: xxswapd v2, vs1 +; P8LE-NEXT: xxswapd v3, vs2 +; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v2, v3 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll index e3d9027d9e98..e1d051d6f3cf 100644 --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -1006,9 +1006,11 @@ define <4 x i16> @dont_fold_urem_one(<4 x 
i16> %x) { ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr @@ -1064,41 +1066,43 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 ; P8LE-NEXT: li r3, 0 -; P8LE-NEXT: lis r8, 24749 -; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: oris r5, r3, 45590 -; P8LE-NEXT: ori r8, r8, 47143 -; P8LE-NEXT: oris r3, r3, 51306 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: ori r3, r3, 30865 +; P8LE-NEXT: lis r9, 24749 +; P8LE-NEXT: oris r5, r3, 51306 +; P8LE-NEXT: oris r3, r3, 45590 +; P8LE-NEXT: ori r9, r9, 47143 +; P8LE-NEXT: ori r5, r5, 30865 +; P8LE-NEXT: ori r3, r3, 17097 ; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r7, r4, 16, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r5, r9, r5 -; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8LE-NEXT: mulld r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 28, 36 -; P8LE-NEXT: rldicl r8, r8, 21, 43 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: rldicl r3, r3, 24, 40 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: subf r5, r5, r6 -; P8LE-NEXT: subf r6, r8, r7 -; P8LE-NEXT: mtvsrd f0, r5 -; P8LE-NEXT: subf r3, r3, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rlwinm r8, r6, 31, 17, 31 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: mulld r5, r8, r5 +; P8LE-NEXT: rlwinm r8, r7, 0, 16, 31 +; P8LE-NEXT: mulld r3, r8, r3 +; P8LE-NEXT: rlwinm r8, r4, 0, 16, 31 +; P8LE-NEXT: mulld r8, r8, r9 +; P8LE-NEXT: li r9, 0 +; P8LE-NEXT: mtvsrd f0, r9 +; P8LE-NEXT: rldicl r5, r5, 24, 40 +; P8LE-NEXT: rldicl r3, r3, 28, 36 +; P8LE-NEXT: mulli r5, r5, 654 ; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: rldicl r8, r8, 21, 43 +; P8LE-NEXT: mulli r3, r3, 23 +; P8LE-NEXT: mulli r8, r8, 5423 +; P8LE-NEXT: subf r5, r5, r6 +; P8LE-NEXT: subf r3, r3, r7 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: subf r4, r8, r4 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 -; P8LE-NEXT: vmrglh v3, v4, v5 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_one: diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 82ff94e868d3..df3880e3d0c2 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1400,7 +1400,9 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) { ; AVX-64-LABEL: f4xi64_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1458,7 +1460,9 @@ 
define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; AVX-64-LABEL: f8xi64_i128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm3 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1466,7 +1470,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1] ; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -1535,7 +1539,9 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX-64-NEXT: movl $1, %eax +; AVX-64-NEXT: vmovq %rax, %xmm4 +; AVX-64-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 99799d17d215..208d84010763 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -560,17 +560,18 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -586,17 +587,18 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; 
X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq @@ -656,17 +658,18 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl $1, %eax +; X86-SSE2-NEXT: movd %eax, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -682,17 +685,18 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: movd %eax, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index 19659ed5377c..ab95e18faea1 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -319,30 +319,39 @@ define <16 x i16> 
@insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE3: # %bb.0: +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: movl $255, %eax -; SSE3-NEXT: movd %eax, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: pandn %xmm2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; SSE3-NEXT: por %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] -; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero -; SSSE3-NEXT: por {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: movl $255, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero +; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -367,45 +376,61 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] +; SSE2-NEXT: pandn %xmm3, %xmm5 +; 
SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSE3: # %bb.0: +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: movl $255, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: pandn %xmm3, %xmm2 ; SSE3-NEXT: por %xmm2, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; SSE3-NEXT: pand %xmm2, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] -; SSE3-NEXT: por %xmm3, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: por {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; SSE3-NEXT: por %xmm4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; SSE3-NEXT: pand %xmm5, %xmm1 +; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] +; SSE3-NEXT: pandn %xmm3, %xmm5 +; SSE3-NEXT: por %xmm5, %xmm1 ; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm3, %xmm1 +; SSE3-NEXT: por %xmm4, %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSSE3-NEXT: movl $255, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSSE3-NEXT: por %xmm0, %xmm2 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15] -; SSSE3-NEXT: por {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll index 4e1014fa28a7..4a0a52903d0b 100644 --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -307,17 +307,25 @@ define i32 @load_partial_illegal_type() { ; SSE2: # %bb.0: ; SSE2-NEXT: movzwl {{.*}}(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movl $2, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_partial_illegal_type: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzwl {{.*}}(%rip), %eax ; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movl $2, %eax +; SSSE3-NEXT: 
movd %eax, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5,6,7,8,9,10,11,12,13,14,15] -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr30562.ll b/llvm/test/CodeGen/X86/pr30562.ll index 05d5c09d55b9..24cbf10ed53d 100644 --- a/llvm/test/CodeGen/X86/pr30562.ll +++ b/llvm/test/CodeGen/X86/pr30562.ll @@ -6,18 +6,20 @@ define i32 @foo(i64* nocapture %perm, i32 %n) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: andl $1, %ecx -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [2,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -24(%rsp,%rcx,8), %rdx -; CHECK-NEXT: movups %xmm0, (%rdi,%rdx,8) +; CHECK-NEXT: movdqu %xmm0, (%rdi,%rdx,8) ; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %exit ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index b657a42445b8..09a0b865356a 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -12,14 +12,16 @@ define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; X86-NEXT: movl $1, %edx +; X86-NEXT: movd %edx, %xmm0 ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: t0: ; X64: # %bb.0: # %entry -; X64-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; X64-NEXT: movl $1, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 805ff9f69ed5..b8ace4effe88 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -1531,7 +1531,9 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { ; ; X64-LABEL: mul_v2i64_0_1: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; X64-NEXT: movl $1, %eax +; X64-NEXT: movq %rax, %xmm1 +; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pmuludq %xmm1, %xmm2 ; X64-NEXT: psrlq $32, %xmm0 @@ -1542,7 +1544,9 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { ; ; X64-XOP-LABEL: mul_v2i64_0_1: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; X64-XOP-NEXT: movl $1, %eax +; X64-XOP-NEXT: vmovq %rax, %xmm1 +; X64-XOP-NEXT: vpslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1552,7 +1556,9 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_0_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; X64-AVX2-NEXT: movl $1, %eax +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1562,7 +1568,10 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { ; ; X64-AVX512DQ-LABEL: mul_v2i64_0_1: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: movl $1, %eax +; X64-AVX512DQ-NEXT: vmovq %rax, %xmm1 +; X64-AVX512DQ-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX512DQ-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <2 x i64> %a0, ret <2 x i64> %1 @@ -1586,45 +1595,62 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { ; ; X64-LABEL: mul_v2i64_neg_0_1: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: movq $-1, %rax +; X64-NEXT: movq %rax, %xmm2 +; X64-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; X64-NEXT: pmuludq %xmm2, %xmm1 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-NEXT: pmuludq %xmm0, %xmm3 +; X64-NEXT: paddq %xmm1, %xmm3 +; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: pmuludq %xmm2, %xmm0 ; X64-NEXT: paddq %xmm3, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_0_1: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-XOP-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm1 +; X64-XOP-NEXT: movq $-1, %rax +; X64-XOP-NEXT: vmovq %rax, %xmm2 +; X64-XOP-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; X64-XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; X64-XOP-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-XOP-NEXT: vmovq %rax, %xmm3 +; X64-XOP-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; X64-XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-XOP-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v2i64_neg_0_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm1 +; X64-AVX2-NEXT: movq $-1, %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm2 +; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; X64-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-AVX2-NEXT: vmovq %rax, %xmm3 +; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v2i64_neg_0_1: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: movq $-1, %rax +; X64-AVX512DQ-NEXT: vmovq %rax, %xmm1 +; X64-AVX512DQ-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX512DQ-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <2 x i64> %a0, ret <2 x i64> %1 @@ -1648,40 +1674,48 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { ; ; X64-LABEL: mul_v2i64_15_neg_63: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [15,18446744073709551553] +; X64-NEXT: pmuludq %xmm2, %xmm1 +; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-NEXT: pmuludq %xmm0, %xmm3 +; X64-NEXT: paddq %xmm1, %xmm3 +; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: pmuludq %xmm2, %xmm0 ; X64-NEXT: paddq %xmm3, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_15_neg_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553] -; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-XOP-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm1 +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [15,18446744073709551553] +; X64-XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; X64-XOP-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-XOP-NEXT: vmovq %rax, %xmm3 +; X64-XOP-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; X64-XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-XOP-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; X64-XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v2i64_15_neg_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,18446744073709551553] -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm1 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,18446744073709551553] +; X64-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X64-AVX2-NEXT: vmovq %rax, %xmm3 +; X64-AVX2-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7] +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v2i64_15_neg_63: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 94b00fbd937e..be6ece76420c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -46,7 +46,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movq $-1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 38cd2a3ae968..18463a993c90 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2352,8 +2352,11 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -2505,7 +2508,10 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v16i64_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] +; SSE-NEXT: pmuludq %xmm8, %xmm0 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 @@ -2533,7 +2539,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm8 +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 ; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm0 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5
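As a standalone illustration of the reasoning in the commit message: the sketch below models why the known bits of a SCALAR_TO_VECTOR source may only be forwarded when just the 0'th element is demanded (DemandedElts == 1). It is a toy C++ program, not LLVM code; the KnownBits struct and commonBits helper are simplified stand-ins for llvm::KnownBits, and the concrete lane values are arbitrary examples.

// Toy model: forwarding the scalar's known bits from a SCALAR_TO_VECTOR node
// is only sound when just lane 0 is demanded, because the upper lanes are
// undef and may hold any value.
#include <cstdint>
#include <iostream>

struct KnownBits {
  uint16_t Zero = 0; // bits known to be zero
  uint16_t One = 0;  // bits known to be one
};

// Intersect knowledge across two possible lane values: a bit stays "known"
// only if it has the same value in both.
static KnownBits commonBits(uint16_t A, uint16_t B) {
  KnownBits K;
  K.One = A & B;
  K.Zero = static_cast<uint16_t>(~A & ~B);
  return K;
}

int main() {
  const uint16_t Scalar = 0x0005; // source scalar: bits 0 and 2 are set
  KnownBits ScalarKnown{static_cast<uint16_t>(~Scalar), Scalar};

  // Demanding only lane 0: every demanded lane holds Scalar, so the scalar's
  // known bits may be reported as the vector's known bits.
  std::cout << std::hex << "lane 0 only: One=0x" << ScalarKnown.One
            << " Zero=0x" << ScalarKnown.Zero << "\n";

  // Demanding lane 1 as well: that lane is undef and could materialize as
  // anything, e.g. 0x0000 or 0xFFFF. A bit is "known" here only if it holds
  // for every demanded lane, so the sound answer is "nothing known".
  KnownBits AcrossLanes = commonBits(0x0000, 0xFFFF);
  std::cout << "lanes 0 and 1: One=0x" << AcrossLanes.One
            << " Zero=0x" << AcrossLanes.Zero << "\n";
  return 0;
}

This mirrors the guarded forwarding added in the TargetLowering.cpp hunk above: with the guard, demanding any upper (undef) element means no known bits are reported, which is what forces the more conservative test codegen seen in the updated CHECK lines.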