From 606eb2367f9f0bef2d1e0182bbb2bf4effb1711e Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 4 Jun 2019 16:40:04 +0000 Subject: [PATCH] [x86] split 256-bit store of concatenated vectors This shows up as a side issue to the main problem for the AVX target example from PR37428: https://bugs.llvm.org/show_bug.cgi?id=37428 - https://godbolt.org/z/7tpRa3 But as we can see in the pile of existing test diffs, it's actually a widespread problem that affects any AVX or later target. Apart from a couple of oddballs, I think these are all improvements for the reasons stated in the code comment: we do not want to enable YMM unnecessarily (avoid vzeroupper and frequency throttling) and some cores split 256-bit stores anyway. We could say that MergeConsecutiveStores() is going overboard on some of these examples, but that won't solve the problem completely. But that is a reason I'm proposing this as a lowering rather than a combine: we will infinite loop fighting the merge code if we try this earlier. Differential Revision: https://reviews.llvm.org/D62498 llvm-svn: 362524 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 11 + llvm/test/CodeGen/X86/avg.ll | 402 +++++++++--------- .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 24 +- llvm/test/CodeGen/X86/avx-intrinsics-x86.ll | 12 +- llvm/test/CodeGen/X86/avx512-trunc-widen.ll | 16 +- llvm/test/CodeGen/X86/avx512-trunc.ll | 16 +- llvm/test/CodeGen/X86/nontemporal-2.ll | 40 +- llvm/test/CodeGen/X86/oddsubvector.ll | 15 +- llvm/test/CodeGen/X86/pmovsx-inreg.ll | 72 ++-- llvm/test/CodeGen/X86/shrink_vmul-widen.ll | 124 +++--- llvm/test/CodeGen/X86/shrink_vmul.ll | 124 +++--- .../CodeGen/X86/shuffle-vs-trunc-512-widen.ll | 18 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 18 +- llvm/test/CodeGen/X86/subvector-broadcast.ll | 68 +-- llvm/test/CodeGen/X86/vec_fptrunc.ll | 10 +- llvm/test/CodeGen/X86/vec_saddo.ll | 68 +-- llvm/test/CodeGen/X86/vec_smulo.ll | 84 ++-- llvm/test/CodeGen/X86/vec_ssubo.ll | 84 ++-- llvm/test/CodeGen/X86/vec_uaddo.ll | 24 +- llvm/test/CodeGen/X86/vec_umulo.ll | 26 +- llvm/test/CodeGen/X86/vec_usubo.ll | 24 +- llvm/test/CodeGen/X86/vector-gep.ll | 134 +++--- llvm/test/CodeGen/X86/vector-trunc-widen.ll | 72 ++-- llvm/test/CodeGen/X86/vector-trunc.ll | 72 ++-- .../CodeGen/X86/x86-interleaved-access.ll | 73 ++-- 25 files changed, 786 insertions(+), 845 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e493d3d71941..a15e37538206 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1283,6 +1283,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) @@ -21073,7 +21074,17 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); + // If this is a 256-bit store of concatenated ops, we are better off splitting + // that store into two 128-bit stores. This avoids spurious use of 256-bit ops + // and each half can execute independently. Some cores would split the op into + // halves anyway, so the concat (vinsertf128) is purely an extra op. 
MVT StoreVT = StoredVal.getSimpleValueType(); + if (StoreVT.is256BitVector()) { + if (StoredVal.getOpcode() != ISD::CONCAT_VECTORS || !StoredVal.hasOneUse()) + return SDValue(); + return split256BitStore(St, DAG); + } + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index cfa9f11a9c73..22a6daa999d7 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -102,11 +102,10 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8: @@ -267,8 +266,8 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] @@ -279,10 +278,10 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -297,52 +296,52 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,0,1] +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] ; AVX1-NEXT: 
vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm15 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm10, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm12, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm4, %xmm13, %xmm10 -; AVX1-NEXT: vpsubd %xmm4, %xmm11, %xmm11 -; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm4, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm4, %xmm15, %xmm13 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm4, %xmm14, %xmm0 -; AVX1-NEXT: vpsubd %xmm4, %xmm12, %xmm2 -; AVX1-NEXT: vpsubd 
%xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11 +; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10 +; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9 +; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8 +; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12 +; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm3, %xmm14, %xmm0 +; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 @@ -353,13 +352,13 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm13, %xmm4 +; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6 +; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5 +; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 ; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 @@ -368,13 +367,12 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v48i8: @@ -449,13 +447,12 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1 +; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax) -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v48i8: @@ -507,15 +504,14 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -628,11 +624,10 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16: @@ -685,15 +680,14 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -834,11 +828,10 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_2: @@ -893,13 +886,12 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm1 -; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: @@ -1013,11 +1005,10 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; 
AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_2: @@ -1070,15 +1061,14 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1206,11 +1196,10 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_const: @@ -1258,15 +1247,14 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275] ; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_const: @@ -1365,11 +1353,10 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind { ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_const: @@ -1416,15 +1403,14 @@ define void @avg_v32i16_const(<32 x i16>* %a) nounwind { ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, 
(%rax) -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_const: @@ -1665,100 +1651,96 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $96, %rsp +; AVX1-NEXT: subq $32, %rsp ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpavgb 288(%rbp), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpavgb 320(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpavgb 352(%rbp), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm13 -; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpavgb 384(%rbp), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm14 -; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-NEXT: vpavgb 416(%rbp), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm15 -; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vpavgb 448(%rbp), %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm12 -; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-NEXT: vpavgb 480(%rbp), %xmm6, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-NEXT: vpavgb 512(%rbp), %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-NEXT: vmovdqa 16(%rbp), %xmm0 -; AVX1-NEXT: vmovdqa 32(%rbp), %xmm1 -; AVX1-NEXT: vpavgb 528(%rbp), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 544(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 -; AVX1-NEXT: vmovdqa 48(%rbp), %xmm0 -; AVX1-NEXT: vmovdqa 64(%rbp), %xmm1 -; AVX1-NEXT: vpavgb 560(%rbp), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 576(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX1-NEXT: vmovdqa 80(%rbp), %xmm0 -; AVX1-NEXT: vmovdqa 96(%rbp), %xmm1 -; AVX1-NEXT: vpavgb 592(%rbp), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 608(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 -; AVX1-NEXT: vmovdqa 112(%rbp), %xmm0 -; AVX1-NEXT: vmovdqa 128(%rbp), %xmm1 -; AVX1-NEXT: vpavgb 624(%rbp), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 640(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa 144(%rbp), %xmm1 -; AVX1-NEXT: vmovdqa 160(%rbp), %xmm2 -; AVX1-NEXT: vpavgb 656(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vpavgb 672(%rbp), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovdqa 176(%rbp), %xmm2 -; AVX1-NEXT: vmovdqa 192(%rbp), %xmm3 -; AVX1-NEXT: vpavgb 688(%rbp), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb 704(%rbp), %xmm3, 
%xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovdqa 208(%rbp), %xmm3 -; AVX1-NEXT: vmovdqa 224(%rbp), %xmm4 -; AVX1-NEXT: vpavgb 720(%rbp), %xmm3, %xmm3 -; AVX1-NEXT: vpavgb 736(%rbp), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vmovdqa 240(%rbp), %xmm4 -; AVX1-NEXT: vpavgb 752(%rbp), %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa 256(%rbp), %xmm11 -; AVX1-NEXT: vpavgb 768(%rbp), %xmm11, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vmovaps %ymm4, 480(%rdi) -; AVX1-NEXT: vmovaps %ymm3, 448(%rdi) -; AVX1-NEXT: vmovaps %ymm2, 416(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 384(%rdi) -; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) -; AVX1-NEXT: vmovaps %ymm10, 320(%rdi) -; AVX1-NEXT: vmovaps %ymm9, 288(%rdi) -; AVX1-NEXT: vmovaps %ymm8, 256(%rdi) -; AVX1-NEXT: vmovaps %ymm7, 224(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 192(%rdi) -; AVX1-NEXT: vmovaps %ymm12, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm15, 128(%rdi) -; AVX1-NEXT: vmovaps %ymm14, 96(%rdi) -; AVX1-NEXT: vmovaps %ymm13, 64(%rdi) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: vmovdqa 256(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 768(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 496(%rdi) +; AVX1-NEXT: vmovdqa 240(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 752(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 480(%rdi) +; AVX1-NEXT: vmovdqa 224(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 736(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 464(%rdi) +; AVX1-NEXT: vmovdqa 208(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 720(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 448(%rdi) +; AVX1-NEXT: vmovdqa 192(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 704(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 432(%rdi) +; AVX1-NEXT: vmovdqa 176(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 688(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 416(%rdi) +; AVX1-NEXT: vmovdqa 160(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 672(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 400(%rdi) +; AVX1-NEXT: vmovdqa 144(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 656(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 384(%rdi) +; AVX1-NEXT: vmovdqa 128(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 640(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 368(%rdi) +; AVX1-NEXT: vmovdqa 112(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 624(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 352(%rdi) +; AVX1-NEXT: vmovdqa 96(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 608(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 336(%rdi) +; AVX1-NEXT: vmovdqa 80(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 592(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 320(%rdi) +; AVX1-NEXT: vmovdqa 64(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 576(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 304(%rdi) +; AVX1-NEXT: vmovdqa 48(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 560(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 288(%rdi) +; AVX1-NEXT: vmovdqa 32(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 544(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 272(%rdi) +; AVX1-NEXT: vmovdqa 16(%rbp), %xmm8 +; AVX1-NEXT: vpavgb 528(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 256(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpavgb 512(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa %xmm8, 240(%rdi) +; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa %xmm7, 224(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-NEXT: vpavgb 480(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa %xmm7, 208(%rdi) +; 
AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, 192(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpavgb 448(%rbp), %xmm6, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, 176(%rdi) +; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, 160(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpavgb 416(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, 144(%rdi) +; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa %xmm4, 128(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpavgb 384(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa %xmm4, 112(%rdi) +; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, 96(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpavgb 352(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, 80(%rdi) +; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpavgb 320(%rbp), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, 48(%rdi) +; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpavgb 288(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index 8f0ec5030eb0..9706bf3455fe 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -725,12 +725,12 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; X86-AVX-LABEL: test_x86_avx_storeu_dq_256: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 # encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; X86-AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2] -; X86-AVX-NEXT: vpsubb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xf8,0xca] -; X86-AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc2] -; X86-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; X86-AVX-NEXT: vmovups %ymm0, (%eax) # encoding: [0xc5,0xfc,0x11,0x00] +; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm2 # encoding: [0xc5,0xf9,0xf8,0xd1] +; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1] +; X86-AVX-NEXT: vmovdqu %xmm0, 16(%eax) # encoding: [0xc5,0xfa,0x7f,0x40,0x10] +; X86-AVX-NEXT: vmovdqu %xmm2, (%eax) # encoding: [0xc5,0xfa,0x7f,0x10] ; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; @@ -745,12 +745,12 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; ; X64-AVX-LABEL: test_x86_avx_storeu_dq_256: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 # encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; X64-AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2] -; X64-AVX-NEXT: vpsubb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xf8,0xca] -; X64-AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc2] -; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # encoding: 
[0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; X64-AVX-NEXT: vmovups %ymm0, (%rdi) # encoding: [0xc5,0xfc,0x11,0x07] +; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm2 # encoding: [0xc5,0xf9,0xf8,0xd1] +; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1] +; X64-AVX-NEXT: vmovdqu %xmm0, 16(%rdi) # encoding: [0xc5,0xfa,0x7f,0x47,0x10] +; X64-AVX-NEXT: vmovdqu %xmm2, (%rdi) # encoding: [0xc5,0xfa,0x7f,0x17] ; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll index 2fd2b863859c..8e48289c1042 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -916,8 +916,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1] -; X86-AVX-NEXT: vmovntdq %ymm0, (%eax) # encoding: [0xc5,0xfd,0xe7,0x00] -; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX-NEXT: vmovntdq %xmm0, (%eax) # encoding: [0xc5,0xf9,0xe7,0x00] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: movnt_dq: @@ -925,24 +924,21 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1] -; X86-AVX512VL-NEXT: vmovntdq %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00] -; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512VL-NEXT: vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: movnt_dq: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1] -; X64-AVX-NEXT: vmovntdq %ymm0, (%rdi) # encoding: [0xc5,0xfd,0xe7,0x07] -; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) # encoding: [0xc5,0xf9,0xe7,0x07] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: movnt_dq: ; X64-AVX512VL: # %bb.0: ; X64-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1] -; X64-AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07] -; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512VL-NEXT: vmovntdq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a2 = add <2 x i64> %a1, %a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-trunc-widen.ll b/llvm/test/CodeGen/X86/avx512-trunc-widen.ll index ba451973faa0..1ce08c01773d 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc-widen.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc-widen.ll @@ -462,12 +462,10 @@ define <32 
x i8> @trunc_wb_512(<32 x i16> %i) #0 { define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { ; KNL-LABEL: trunc_wb_512_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa %ymm0, (%rdi) +; KNL-NEXT: vpmovdb %zmm1, 16(%rdi) +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -672,8 +670,8 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { ; ALL: ## %bb.0: ; ALL-NEXT: vpmovusdb %zmm0, %xmm0 ; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vmovdqu %ymm0, (%rdi) +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <32 x i32> %i, @@ -952,8 +950,8 @@ define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { ; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 ; ALL-NEXT: vpmovusdb %zmm0, %xmm0 ; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vmovdqu %ymm0, (%rdi) +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x1 = icmp sgt <32 x i32> %i, diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll index c15d33222ca0..263f7c90441d 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -458,12 +458,10 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { ; KNL-LABEL: trunc_wb_512_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa %ymm0, (%rdi) +; KNL-NEXT: vpmovdb %zmm1, 16(%rdi) +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; KNL-NEXT: vpmovdb %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -667,8 +665,8 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { ; ALL: ## %bb.0: ; 
ALL-NEXT: vpmovusdb %zmm0, %xmm0 ; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vmovdqu %ymm0, (%rdi) +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <32 x i32> %i, @@ -948,8 +946,8 @@ define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { ; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 ; ALL-NEXT: vpmovusdb %zmm0, %xmm0 ; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vmovdqu %ymm0, (%rdi) +; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) +; ALL-NEXT: vmovdqu %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x1 = icmp sgt <32 x i32> %i, diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll index 5b39cb16afec..aa3e7cda18c0 100644 --- a/llvm/test/CodeGen/X86/nontemporal-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-2.ll @@ -1061,12 +1061,12 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) { ; ; AVX1-LABEL: test_op_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntdq %xmm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1126,12 +1126,12 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) { ; ; AVX1-LABEL: test_op_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntdq %xmm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1164,12 +1164,12 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) { ; ; AVX1-LABEL: test_op_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntdq %xmm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1202,12 +1202,12 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) { ; ; AVX1-LABEL: test_op_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntdq %xmm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; 
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index 69ea53e7e9c8..9bc6c0f380a0 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -116,13 +116,14 @@ define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %stru ; ; AVX-LABEL: PR40815: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm1, (%rsi) -; AVX-NEXT: vmovups %ymm0, 32(%rsi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX-NEXT: vmovaps %xmm3, (%rsi) +; AVX-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX-NEXT: retq ; ; AVX512-LABEL: PR40815: diff --git a/llvm/test/CodeGen/X86/pmovsx-inreg.ll b/llvm/test/CodeGen/X86/pmovsx-inreg.ll index 9ab6917966b3..f89223fa4583 100644 --- a/llvm/test/CodeGen/X86/pmovsx-inreg.ll +++ b/llvm/test/CodeGen/X86/pmovsx-inreg.ll @@ -53,12 +53,12 @@ define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind { ; ; AVX1-LABEL: test2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -134,12 +134,12 @@ define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind { ; ; AVX1-LABEL: test4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -215,12 +215,12 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind { ; ; AVX1-LABEL: test6: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -296,12 +296,12 @@ define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind { ; ; AVX1-LABEL: test8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm1 +; AVX1-NEXT: 
vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -377,12 +377,12 @@ define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind { ; ; AVX1-LABEL: test10: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -458,12 +458,12 @@ define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind { ; ; AVX1-LABEL: test12: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm1 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovups %ymm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll index 0ed79ea4af70..759985800740 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll @@ -215,10 +215,9 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi8: @@ -261,9 +260,8 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_8xi8: @@ -349,12 +347,11 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) 
+; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi8: @@ -417,11 +414,10 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi8: @@ -648,10 +644,9 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi16: @@ -693,9 +688,8 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_8xi16: @@ -780,12 +774,11 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16: @@ -847,11 +840,10 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 
32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi16: @@ -1284,24 +1276,23 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 -; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 -; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 -; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 ; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16_sext: @@ -1351,23 +1342,22 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-AVX1-LABEL: mul_16xi16_sext: ; X64-AVX1: # %bb.0: # %entry ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0 -; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 -; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 -; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3 -; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi16_sext: @@ -2201,8 +2191,8 @@ define void @PR34947(<9 x i16>* %p0, 
<9 x i32>* %p1) nounwind { ; X86-AVX1-NEXT: divl 32(%ecx) ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm3 -; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm1 +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx @@ -2255,14 +2245,13 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) +; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) ; X86-AVX1-NEXT: addl $16, %esp ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: popl %edi ; X86-AVX1-NEXT: popl %ebx ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: PR34947: @@ -2435,8 +2424,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-AVX1-NEXT: divl 32(%rsi) ; X64-AVX1-NEXT: movl %edx, %r8d ; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm3 -; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 ; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx @@ -2486,13 +2475,12 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 ; X64-AVX1-NEXT: movl %eax, (%rax) -; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) ; X64-AVX1-NEXT: popq %rbx ; X64-AVX1-NEXT: popq %rbp -; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 0c8949f24617..5e952472f757 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -209,10 +209,9 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi8: @@ -255,9 +254,8 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) ; 
X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_8xi8: @@ -343,12 +341,11 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi8: @@ -411,11 +408,10 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi8: @@ -640,10 +636,9 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi16: @@ -685,9 +680,8 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_8xi16: @@ -772,12 +766,11 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; 
X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16: @@ -839,11 +832,10 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi16: @@ -1258,24 +1250,23 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 -; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 -; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 -; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 ; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16_sext: @@ -1325,23 +1316,22 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-AVX1-LABEL: mul_16xi16_sext: ; X64-AVX1: # %bb.0: # %entry ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0 -; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 -; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 -; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3 -; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; 
X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) -; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: mul_16xi16_sext: @@ -2157,8 +2147,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-AVX1-NEXT: divl 32(%ecx) ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm3 -; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm1 +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx @@ -2211,14 +2201,13 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) +; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) ; X86-AVX1-NEXT: addl $16, %esp ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: popl %edi ; X86-AVX1-NEXT: popl %ebx ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: PR34947: @@ -2391,8 +2380,8 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-AVX1-NEXT: divl 32(%rsi) ; X64-AVX1-NEXT: movl %edx, %r8d ; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm3 -; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 ; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx @@ -2442,13 +2431,12 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { ; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 ; X64-AVX1-NEXT: movl %eax, (%rax) -; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) ; X64-AVX1-NEXT: popq %rbx ; X64-AVX1-NEXT: popq %rbp -; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll index 1a6bdd3aaa40..737925eca044 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll @@ -88,23 +88,21 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { ; AVX512F-LABEL: trunc_v32i16_to_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v32i16_to_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 19031bbb2c0f..6f94e0c60868 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -88,23 +88,21 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { ; AVX512F-LABEL: trunc_v32i16_to_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v32i16_to_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 3ce584eff2a9..7ecfac5151f2 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -835,24 +835,24 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X32-AVX1: # %bb.0: # %entry -; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,0,4,0] -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,0,2,0] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0] +; X32-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] -; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6 -; X32-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 -; X32-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; X32-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; X32-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X32-AVX1-NEXT: vmovups %ymm0, ga4 +; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 +; X32-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 +; X32-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm6, %ymm2, 
%ymm2 +; X32-AVX1-NEXT: vmovdqu %xmm0, ga4+16 +; X32-AVX1-NEXT: vmovdqu %xmm4, ga4 ; X32-AVX1-NEXT: vmovups %ymm2, gb4+32 ; X32-AVX1-NEXT: vmovups %ymm1, gb4 ; X32-AVX1-NEXT: vzeroupper @@ -886,24 +886,24 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4] -; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2] +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4] ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4] -; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6 -; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 -; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip) +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4] +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 +; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 +; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+{{.*}}(%rip) +; X64-AVX1-NEXT: vmovdqu %xmm4, {{.*}}(%rip) ; X64-AVX1-NEXT: vmovups %ymm2, gb4+{{.*}}(%rip) ; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip) ; X64-AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec_fptrunc.ll b/llvm/test/CodeGen/X86/vec_fptrunc.ll index bb6be6cd9e84..e7318d9d6972 100644 --- a/llvm/test/CodeGen/X86/vec_fptrunc.ll +++ b/llvm/test/CodeGen/X86/vec_fptrunc.ll @@ -99,9 +99,8 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) { ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0 ; X32-AVX-NEXT: vcvtpd2psy 32(%ecx), %xmm1 -; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovups %ymm0, (%eax) -; X32-AVX-NEXT: vzeroupper +; X32-AVX-NEXT: vmovupd %xmm1, 16(%eax) +; X32-AVX-NEXT: vmovupd %xmm0, (%eax) ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: fptrunc_frommem8: @@ -120,9 +119,8 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) { ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0 ; X64-AVX-NEXT: vcvtpd2psy 32(%rdi), %xmm1 -; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovupd %xmm1, 16(%rsi) +; X64-AVX-NEXT: vmovupd %xmm0, (%rsi) ; X64-AVX-NEXT: retq entry: %0 = load <8 x double>, <8 x double>* %in diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index aeb1951fbef8..d37795b55cdc 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -693,8 +693,8 @@ 
define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v8i32: @@ -824,48 +824,48 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm10 -; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpandn %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm1 +; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm7 ; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpandn %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7 ; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm2 -; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpandn %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) 
-; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vmovdqa %xmm9, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm10, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: saddo_v16i32: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index ab97c51df410..3f53f9f2250c 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -973,8 +973,8 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: smulo_v8i32: @@ -1266,59 +1266,59 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpmuldq %xmm4, %xmm6, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6 +; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm8 +; AVX1-NEXT: vpsrad $31, %xmm8, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm8, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5],xmm4[6,7] +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpmuldq %xmm5, %xmm7, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpmuldq %xmm4, %xmm7, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; AVX1-NEXT: vpmulld %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpsrad $31, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3],xmm6[4,5],xmm1[6,7] +; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm2[1,1,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuldq %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] -; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpxor %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm9, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm9, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm3, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vmovdqa %xmm8, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: smulo_v16i32: diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 15c0531d67a7..3dc73e3b4ba1 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -714,8 +714,8 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v8i32: @@ -850,52 +850,52 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm9, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm9, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm8 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm9, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; 
AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm9, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm9, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm9, %xmm6 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm9, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm9, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm4 -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm9, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm3, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmovdqa %xmm10, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm7, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: ssubo_v16i32: diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 41a0e258e3d1..4e9cd2efb74b 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -501,8 +501,8 @@ define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq 
; ; AVX2-LABEL: uaddo_v8i32: @@ -633,19 +633,19 @@ define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: uaddo_v16i32: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 0c95b73853e9..0bcaacc21dfe 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -843,10 +843,10 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm8, %xmm5, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -1111,23 +1111,23 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpxor %xmm9, %xmm5, %xmm5 ; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 ; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm0 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v16i32: diff --git 
a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index b662ac45caf6..c5a7b19cf14d 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -525,8 +525,8 @@ define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: usubo_v8i32: @@ -671,19 +671,19 @@ define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: usubo_v16i32: diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll index 8f62fe538256..693380a48ee2 100644 --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,74 +122,88 @@ define <64 x i16*> @AGEP9(i16* %param, <64 x i32> %off) nounwind { ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp -; CHECK-NEXT: subl $96, %esp -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm4 -; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm3 -; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm4 +; CHECK-NEXT: subl $160, %esp +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm4 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: 
vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill +; CHECK-NEXT: vmovdqa 88(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm4 +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa 104(%ebp), %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%esp) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm4 +; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm2 ; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; CHECK-NEXT: vmovdqa 40(%ebp), %xmm4 -; CHECK-NEXT: vmovdqa 56(%ebp), %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm5, %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm3, %xmm5 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; CHECK-NEXT: vmovdqa 72(%ebp), %xmm5 -; CHECK-NEXT: vmovdqa 88(%ebp), %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm3, %xmm6 -; CHECK-NEXT: vpaddd %xmm5, %xmm5, %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm3, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; CHECK-NEXT: vmovdqa 104(%ebp), %xmm6 -; CHECK-NEXT: vmovdqa 120(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm3, %xmm7 -; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm3, %xmm6 -; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; CHECK-NEXT: vpaddd %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 ; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm3, %xmm7 -; CHECK-NEXT: vmovdqa 136(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 +; CHECK-NEXT: vmovdqa 168(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa 184(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; CHECK-NEXT: vpaddd %xmm7, %xmm3, %xmm7 -; CHECK-NEXT: vmovdqa 168(%ebp), %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: 
vmovaps %ymm1, 224(%eax) -; CHECK-NEXT: vmovaps %ymm0, 192(%eax) -; CHECK-NEXT: vmovaps %ymm6, 160(%eax) -; CHECK-NEXT: vmovaps %ymm5, 128(%eax) -; CHECK-NEXT: vmovaps %ymm4, 96(%eax) -; CHECK-NEXT: vmovaps %ymm2, 64(%eax) -; CHECK-NEXT: vmovaps (%esp), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, 32(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, (%eax) +; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 144(%eax) +; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 128(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 112(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 96(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 80(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 64(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 48(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 32(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 16(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll index 54ebdbe026aa..6a504269b938 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-widen.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -668,14 +668,14 @@ define void @trunc16i32_16i16(<16 x i32> %a) { ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -686,8 +686,8 @@ define void @trunc16i32_16i16(<16 x i32> %a) { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -717,16 +717,16 @@ define void @trunc16i32_16i16_ashr(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i16_ashr: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad 
$16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -792,16 +792,16 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i16_lshr: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1291,14 +1291,14 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1317,23 +1317,19 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX512F-LABEL: trunc32i16_32i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vpmovdb %zmm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: 
trunc32i16_32i8: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqu %ymm0, (%rax) +; AVX512VL-NEXT: vpmovdb %zmm1, (%rax) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 56e86a6bc95f..0027fbe2657d 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -678,14 +678,14 @@ define void @trunc16i32_16i16(<16 x i32> %a) { ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -696,8 +696,8 @@ define void @trunc16i32_16i16(<16 x i32> %a) { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -727,16 +727,16 @@ define void @trunc16i32_16i16_ashr(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i16_ashr: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -802,16 +802,16 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) { ; 
 ; AVX1-LABEL: trunc16i32_16i16_lshr:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -1301,14 +1301,14 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ; AVX1-LABEL: trunc32i16_32i8:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -1327,23 +1327,19 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ;
 ; AVX512F-LABEL: trunc32i16_32i8:
 ; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc32i16_32i8:
 ; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index bff39467c1eb..8cd01b631d60 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -341,11 +341,10 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, (%rdi)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
@@ -358,11 +357,10 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovdqa %xmm0, 48(%rdi)
+; AVX2-NEXT: vmovdqa %xmm4, 32(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
@@ -888,37 +886,20 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 }
 
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
-; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512-LABEL: interleaved_store_vf8_i8_stride4:
-; AVX2OR512: # %bb.0:
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX2OR512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2OR512-NEXT: vmovdqa %ymm0, (%rdi)
-; AVX2OR512-NEXT: vzeroupper
-; AVX2OR512-NEXT: retq
+; AVX-LABEL: interleaved_store_vf8_i8_stride4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX-NEXT: retq
 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32>
 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32>
 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32>
@@ -1096,10 +1077,9 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
@@ -1116,10 +1096,9 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX2-NEXT: vmovdqu %xmm1, (%rdi)
 ; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3: