From 152e7c8b1222d2af61df72c08caaa740e553cb6c Mon Sep 17 00:00:00 2001
From: Matthias Braun
Date: Sat, 9 Jul 2016 00:19:07 +0000
Subject: [PATCH] VirtRegMap: Replace some identity copies with KILL instructions.

An identity COPY like this:
   %AL = COPY %AL, %EAX
has no semantic effect, but encodes liveness information: Further users
of %EAX only depend on this instruction even though it does not define
the full register.

Replace the COPY with a KILL instruction in those cases to maintain this
liveness information. (This reverts a small part of r238588 but this
time adds a comment explaining why a KILL instruction is useful).

llvm-svn: 274952
---
 llvm/lib/CodeGen/VirtRegMap.cpp               |   36 +-
 .../test/CodeGen/AArch64/arm64-collect-loh.ll |    1 +
 llvm/test/CodeGen/PowerPC/machine-combiner.ll |    4 +
 llvm/test/CodeGen/SPARC/32abi.ll              |    2 +
 llvm/test/CodeGen/X86/anyext.ll               |    4 +
 llvm/test/CodeGen/X86/atomic-eflags-reuse.ll  |    3 +-
 llvm/test/CodeGen/X86/avx-cast.ll             |    7 +
 .../CodeGen/X86/avx-intrinsics-fast-isel.ll   |   28 +
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |    2 +
 llvm/test/CodeGen/X86/avx2-conversions.ll     |    2 +
 .../CodeGen/X86/avx2-intrinsics-fast-isel.ll  |    2 +
 llvm/test/CodeGen/X86/avx2-vector-shifts.ll   |    3 +
 llvm/test/CodeGen/X86/avx512-arith.ll         |    8 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |    6 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll           |   15 +-
 llvm/test/CodeGen/X86/avx512-ext.ll           |   14 +
 .../CodeGen/X86/avx512-extract-subvector.ll   |    4 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll |  498 +-
 .../CodeGen/X86/avx512-intrinsics-upgrade.ll  |    8 +
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    |   15 +
 llvm/test/CodeGen/X86/avx512-mask-op.ll       |   11 +
 llvm/test/CodeGen/X86/avx512-select.ll        |    3 +
 llvm/test/CodeGen/X86/avx512-trunc.ll         |   15 +
 llvm/test/CodeGen/X86/avx512-vbroadcast.ll    |    4 +
 llvm/test/CodeGen/X86/avx512-vec-cmp.ll       |  466 +-
 llvm/test/CodeGen/X86/avx512bw-mov.ll         |    8 +
 .../X86/avx512bwvl-intrinsics-upgrade.ll      |    4 +
 .../test/CodeGen/X86/avx512bwvl-intrinsics.ll |   17 +
 llvm/test/CodeGen/X86/avx512dq-intrinsics.ll  |   10 +
 llvm/test/CodeGen/X86/avx512dq-mask-op.ll     |    2 +
 .../test/CodeGen/X86/avx512dqvl-intrinsics.ll |   10 +
 .../X86/avx512vl-intrinsics-upgrade.ll        |   16 +
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll  |   14 +
 llvm/test/CodeGen/X86/bitreverse.ll           |    2 +
 llvm/test/CodeGen/X86/bmi.ll                  |    5 +-
 llvm/test/CodeGen/X86/clz.ll                  |    6 +
 llvm/test/CodeGen/X86/extractelement-index.ll |   20 +
 .../CodeGen/X86/f16c-intrinsics-fast-isel.ll  |    2 +
 llvm/test/CodeGen/X86/fixup-bw-copy.ll        |    3 +-
 llvm/test/CodeGen/X86/h-registers-3.ll        |    4 +
 llvm/test/CodeGen/X86/machine-combiner-int.ll |    5 +
 .../test/CodeGen/X86/masked_gather_scatter.ll |   67 +-
 llvm/test/CodeGen/X86/masked_memop.ll         | 8037 ++++++++++++++++-
 llvm/test/CodeGen/X86/materialize.ll          |    2 +
 llvm/test/CodeGen/X86/movmsk.ll               |    1 +
 llvm/test/CodeGen/X86/or-lea.ll               |   13 +
 llvm/test/CodeGen/X86/pmul.ll                 |    2 +
 llvm/test/CodeGen/X86/pr28173.ll              |    1 +
 llvm/test/CodeGen/X86/promote-i16.ll          |    2 +
 .../CodeGen/X86/tbm-intrinsics-fast-isel.ll   |    4 +
 llvm/test/CodeGen/X86/urem-i8-constant.ll     |    1 +
 llvm/test/CodeGen/X86/urem-power-of-two.ll    |    4 +-
 llvm/test/CodeGen/X86/vec_fp_to_int.ll        |    1 +
 llvm/test/CodeGen/X86/vec_insert-5.ll         |    1 +
 llvm/test/CodeGen/X86/vec_insert-mmx.ll       |    1 +
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        |   19 +
 llvm/test/CodeGen/X86/vec_ss_load_fold.ll     |   45 +-
 .../CodeGen/X86/vec_uint_to_fp-fastmath.ll    |    4 +
 llvm/test/CodeGen/X86/vector-bitreverse.ll    |    8 +
 .../CodeGen/X86/vector-compare-results.ll     |   10 +
 .../CodeGen/X86/vector-half-conversions.ll    |   49 +
 llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll |   64 +
 llvm/test/CodeGen/X86/vector-lzcnt-128.ll     |    8 +
 llvm/test/CodeGen/X86/vector-lzcnt-256.ll     |    8 +
 llvm/test/CodeGen/X86/vector-sext.ll          |    3 +
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |    7 +
 .../test/CodeGen/X86/vector-shift-ashr-256.ll |    5 +
 .../test/CodeGen/X86/vector-shift-lshr-128.ll |    7 +
 .../test/CodeGen/X86/vector-shift-lshr-256.ll |    5 +
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll |    6 +
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll |    5 +
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll |    2 +
 .../X86/vector-shuffle-combining-avx2.ll      |    6 +
 llvm/test/CodeGen/X86/vector-shuffle-v1.ll    |   49 +-
 .../X86/vector-shuffle-variable-128.ll        |   78 +
 llvm/test/CodeGen/X86/vector-trunc-math.ll    |   74 +
 llvm/test/CodeGen/X86/vector-trunc.ll         |    9 +
 llvm/test/CodeGen/X86/widen_bitops-0.ll       |   18 +
 .../CodeGen/X86/x86-shrink-wrap-unwind.ll     |    2 +-
 llvm/test/CodeGen/X86/xaluo.ll                |    1 +
 80 files changed, 9755 insertions(+), 163 deletions(-)

diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 7713adb3c7b0..8a3a0328870d 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -166,6 +166,7 @@ class VirtRegRewriter : public MachineFunctionPass {
   void addMBBLiveIns();
   bool readsUndefSubreg(const MachineOperand &MO) const;
   void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
+  void handleIdentityCopy(MachineInstr &MI) const;
 
 public:
   static char ID;
@@ -346,6 +347,30 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
   return true;
 }
 
+void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
+  if (!MI.isIdentityCopy())
+    return;
+  DEBUG(dbgs() << "Identity copy: " << MI);
+  ++NumIdCopies;
+
+  // Copies like:
+  //    %R0 = COPY %R0
+  //    %AL = COPY %AL, %EAX
+  // give us additional liveness information: The target (super-)register
+  // must not be valid before this point. Replace the COPY with a KILL
+  // instruction to maintain this information.
+  if (MI.getOperand(0).isUndef() || MI.getNumOperands() > 2) {
+    MI.setDesc(TII->get(TargetOpcode::KILL));
+    DEBUG(dbgs() << "  replace by: " << MI);
+    return;
+  }
+
+  if (Indexes)
+    Indexes->removeMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+  DEBUG(dbgs() << "  deleted.\n");
+}
+
 void VirtRegRewriter::rewrite() {
   bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
   SmallVector SuperDeads;
@@ -435,15 +460,8 @@ void VirtRegRewriter::rewrite() {
 
       DEBUG(dbgs() << "> " << *MI);
 
-      // Finally, remove any identity copies.
-      if (MI->isIdentityCopy()) {
-        ++NumIdCopies;
-        DEBUG(dbgs() << "Deleting identity copy.\n");
-        if (Indexes)
-          Indexes->removeMachineInstrFromMaps(*MI);
-        // It's safe to erase MI because MII has already been incremented.
-        MI->eraseFromParent();
-      }
+      // We can remove identity copies right now.
+      handleIdentityCopy(*MI);
     }
   }
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 3f633ba2a44d..3fc0d45f065c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -613,6 +613,7 @@ define <1 x i8> @getL() {
 ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE
 ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
 ; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
+; CHECK-NEXT: ; kill
 ; Ultimately we should generate str b0, but right now, we match the vector
 ; variant which does not allow to fold the immediate into the store.
; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]] diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll index 93fb2020d530..ae9e2e8cf6a6 100644 --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -98,6 +98,7 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, < ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: # kill ; CHECK-NEXT: blr %t0 = fadd <4 x float> %x0, %x1 @@ -115,6 +116,7 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, < ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: # kill ; CHECK-NEXT: blr %t0 = fadd <4 x float> %x0, %x1 @@ -132,6 +134,7 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, < ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: # kill ; CHECK-NEXT: blr %t0 = fadd <4 x float> %x0, %x1 @@ -149,6 +152,7 @@ define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, < ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: # kill ; CHECK-NEXT: blr %t0 = fadd <4 x float> %x0, %x1 diff --git a/llvm/test/CodeGen/SPARC/32abi.ll b/llvm/test/CodeGen/SPARC/32abi.ll index ccea1adceaf9..09e7a3a09d86 100644 --- a/llvm/test/CodeGen/SPARC/32abi.ll +++ b/llvm/test/CodeGen/SPARC/32abi.ll @@ -60,9 +60,11 @@ define void @call_intarg(i32 %i0, i8* %i1) { ; HARD: mov %i5, %g2 ; HARD-NEXT: ld [%fp+92], %g3 ; HARD-NEXT: mov %i4, %i5 +; HARD-NEXT: ! kill ; HARD-NEXT: std %g2, [%fp+-24] ; HARD-NEXT: mov %i3, %i4 ; HARD-NEXT: std %i4, [%fp+-16] +; HARD-NEXT: ! 
kill ; HARD-NEXT: std %i0, [%fp+-8] ; HARD-NEXT: st %i2, [%fp+-28] ; HARD-NEXT: ld [%fp+104], %f0 diff --git a/llvm/test/CodeGen/X86/anyext.ll b/llvm/test/CodeGen/X86/anyext.ll index 3c53983fe4e0..4f4218bdd63d 100644 --- a/llvm/test/CodeGen/X86/anyext.ll +++ b/llvm/test/CodeGen/X86/anyext.ll @@ -8,6 +8,7 @@ define i32 @foo(i32 %p, i8 zeroext %x) nounwind { ; X32-LABEL: foo: ; X32: # BB#0: ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: %EAX %EAX %AX ; X32-NEXT: divb {{[0-9]+}}(%esp) ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: andl $1, %eax @@ -16,6 +17,7 @@ define i32 @foo(i32 %p, i8 zeroext %x) nounwind { ; X64-LABEL: foo: ; X64: # BB#0: ; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: # kill: %EAX %EAX %AX ; X64-NEXT: divb %sil ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: andl $1, %eax @@ -33,6 +35,7 @@ define i32 @bar(i32 %p, i16 zeroext %x) nounwind { ; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: divw {{[0-9]+}}(%esp) +; X32-NEXT: # kill: %AX %AX %EAX ; X32-NEXT: andl $1, %eax ; X32-NEXT: retl ; @@ -41,6 +44,7 @@ define i32 @bar(i32 %p, i16 zeroext %x) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: divw %si +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: andl $1, %eax ; X64-NEXT: retq %q = trunc i32 %p to i16 diff --git a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll index fe872bdde12a..dc1814b55cd3 100644 --- a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll +++ b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s define i32 @test_add_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 { @@ -64,6 +64,7 @@ define i8 @test_add_1_setcc_slt(i64* %p) #0 { ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: lock xaddq %rax, (%rdi) ; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: # kill: %AL %AL %RAX ; CHECK-NEXT: retq entry: %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst diff --git a/llvm/test/CodeGen/X86/avx-cast.ll b/llvm/test/CodeGen/X86/avx-cast.ll index 8dd3529eb483..103715c3628e 100644 --- a/llvm/test/CodeGen/X86/avx-cast.ll +++ b/llvm/test/CodeGen/X86/avx-cast.ll @@ -9,6 +9,7 @@ define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp { ; AVX-LABEL: castA: ; AVX: ## BB#0: +; AVX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: retq @@ -19,6 +20,7 @@ define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp { define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp { ; AVX-LABEL: castB: ; AVX: ## BB#0: +; AVX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: retq @@ -31,12 +33,14 @@ define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp { define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp { ; AVX1-LABEL: castC: ; AVX1: ## BB#0: +; AVX1-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: castC: ; AVX2: ## BB#0: +; AVX2-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq @@ -50,6 +54,7 @@ define <4 x i64> @castC(<2 x 
i64> %m) nounwind uwtable readnone ssp { define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp { ; AVX-LABEL: castD: ; AVX: ## BB#0: +; AVX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> @@ -59,6 +64,7 @@ define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp { define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp { ; AVX-LABEL: castE: ; AVX: ## BB#0: +; AVX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> @@ -68,6 +74,7 @@ define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp { define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp { ; AVX-LABEL: castF: ; AVX: ## BB#0: +; AVX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 728fa1ce87ca..cca8f99ec01f 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -318,10 +318,12 @@ define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind { define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { ; X32-LABEL: test_mm256_castpd128_pd256: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_castpd128_pd256: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: retq %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> ret <4 x double> %res @@ -330,11 +332,13 @@ define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind { ; X32-LABEL: test_mm256_castpd256_pd128: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_castpd256_pd128: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> @@ -368,10 +372,12 @@ define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind { define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm256_castps128_ps256: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_castps128_ps256: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: retq %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> ret <8 x float> %res @@ -380,11 +386,13 @@ define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { ; X32-LABEL: test_mm256_castps256_ps128: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_castps256_ps128: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> @@ -394,10 +402,12 @@ define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm256_castsi128_si256: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: retl ; ; X64-LABEL: 
test_mm256_castsi128_si256: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: retq %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> ret <4 x i64> %res @@ -430,11 +440,13 @@ define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind { define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind { ; X32-LABEL: test_mm256_castsi256_si128: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_castsi256_si128: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> @@ -1032,11 +1044,13 @@ define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind { define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm256_insertf128_pd: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_insertf128_pd: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; X64-NEXT: retq %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> @@ -1062,11 +1076,13 @@ define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) n define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm256_insertf128_si256: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_insertf128_si256: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; X64-NEXT: retq %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> @@ -2173,11 +2189,13 @@ define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) noun define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind { ; X32-LABEL: test_mm256_set_m128: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_m128: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> @@ -2187,11 +2205,13 @@ define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwi define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm256_set_m128d: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_m128d: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x double> %a0 to <4 x float> @@ -2204,11 +2224,13 @@ define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) no define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm256_set_m128i: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_m128i: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x float> @@ -2804,11 +2826,13 @@ define 
<4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nou define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind { ; X32-LABEL: test_mm256_setr_m128: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_setr_m128: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> @@ -2818,11 +2842,13 @@ define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounw define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm256_setr_m128d: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_setr_m128d: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x double> %a0 to <4 x float> @@ -2835,11 +2861,13 @@ define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) n define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm256_setr_m128i: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_setr_m128i: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x float> diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index fae47b24f3b6..a7b4c6b285d8 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -38,6 +38,7 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1 define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1 %XMM1 %YMM1 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; CHECK-NEXT: retl %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2) @@ -86,6 +87,7 @@ declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind read define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) { ; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2) diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll index 6567b933ad78..3e4dcb0a3d88 100644 --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -6,6 +6,7 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; CHECK: ## BB#0: ; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; CHECK-NEXT: ## kill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %B = trunc <4 x i64> %A to <4 x i32> @@ -17,6 +18,7 @@ define <8 x i16> @trunc8(<8 x i32> %A) nounwind { ; CHECK: ## BB#0: ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: ## kill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %B = trunc <8 x i32> %A to <8 x i16> diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll index 4f410b415c08..9daeb8be9720 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -1708,11 +1708,13 @@ define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test0_mm256_inserti128_si256: ; X32: # BB#0: +; X32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X32-NEXT: retl ; ; X64-LABEL: test0_mm256_inserti128_si256: ; X64: # BB#0: +; X64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X64-NEXT: retq %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll index 64c8a0ef68dc..b304d8456d90 100644 --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -279,6 +279,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { ; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shl = shl <8 x i16> %r, %a @@ -330,6 +331,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { ; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %ashr = ashr <8 x i16> %r, %a @@ -394,6 +396,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { ; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %lshr = lshr <8 x i16> %r, %a diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 515f571b1a43..14337a4bbc0d 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX512F %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck --check-prefix=CHECK --check-prefix=AVX512VL %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck --check-prefix=CHECK 
--check-prefix=AVX512BW %s @@ -675,6 +675,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ; AVX512F-LABEL: test_mask_vminpd: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} @@ -689,6 +690,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ; ; AVX512BW-LABEL: test_mask_vminpd: ; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} @@ -696,6 +698,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ; ; AVX512DQ-LABEL: test_mask_vminpd: ; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} @@ -735,6 +738,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ; AVX512F-LABEL: test_mask_vmaxpd: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} @@ -749,6 +753,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ; ; AVX512BW-LABEL: test_mask_vmaxpd: ; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} @@ -756,6 +761,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ; ; AVX512DQ-LABEL: test_mask_vmaxpd: ; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM3 %YMM3 %ZMM3 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 ; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 112e96cbbbbf..303e7ac51824 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32 @@ -132,6 +132,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: callq _func8xi1 ; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL-NEXT: vpslld $31, %ymm0, %ymm0 @@ -160,6 +161,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: ## kill: 
%XMM0 %XMM0 %YMM0 ; KNL_X32-NEXT: calll _func8xi1 ; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0 @@ -277,6 +279,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: callq _func8xi1 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 @@ -312,6 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 +; KNL_X32-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL_X32-NEXT: calll _func8xi1 ; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL_X32-NEXT: vpsllvq LCPI7_0, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 42ba5d466e07..57e8a1341513 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX @@ -90,6 +90,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { ; ; SKX-LABEL: sltof2f32: ; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0 ; SKX-NEXT: retq %b = sitofp <2 x i64> %a to <2 x float> @@ -287,7 +288,9 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind { define <8 x i32> @fptoui_256(<8 x float> %a) nounwind { ; KNL-LABEL: fptoui_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fptoui_256: @@ -301,7 +304,9 @@ define <8 x i32> @fptoui_256(<8 x float> %a) nounwind { define <4 x i32> @fptoui_128(<4 x float> %a) nounwind { ; KNL-LABEL: fptoui_128: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fptoui_128: @@ -324,7 +329,9 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind { define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind { ; KNL-LABEL: fptoui_256d: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fptoui_256d: @@ -583,7 +590,9 @@ define <16 x double> @uitof64(<16 x i32> %a) nounwind { define <4 x double> @uitof64_256(<4 x i32> %a) nounwind { ; KNL-LABEL: uitof64_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: uitof64_256: @@ -606,7 +615,9 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind { define <8 x float> @uitof32_256(<8 x i32> %a) nounwind { ; KNL-LABEL: uitof32_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: uitof32_256: @@ -620,7 +631,9 @@ define <8 x float> @uitof32_256(<8 x i32> %a) nounwind { define <4 x float> @uitof32_128(<4 x i32> %a) nounwind 
{ ; KNL-LABEL: uitof32_128: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: uitof32_128: diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index cb74c598a1aa..a944e85f71b0 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -348,6 +348,7 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_8x8mem_to_8x32: @@ -371,6 +372,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re ; KNL-NEXT: vpmovsxbd (%rdi), %ymm0 ; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_8x8mem_to_8x32: @@ -705,6 +707,7 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_8x16mem_to_8x32: @@ -728,6 +731,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw ; KNL-NEXT: vpmovsxwd (%rdi), %ymm0 ; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_8x16mem_to_8x32mask: @@ -761,6 +765,7 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind ; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_8x16_to_8x32mask: @@ -1323,6 +1328,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AX %AX %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_16i8_to_16i1: @@ -1330,6 +1336,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ; SKX-NEXT: vpmovb2m %xmm0, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq %mask_b = trunc <16 x i8>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 @@ -1342,6 +1349,7 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { ; ALL-NEXT: vpslld $31, %zmm0, %zmm0 ; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: ## kill: %AX %AX %EAX ; ALL-NEXT: retq %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 @@ -1379,6 +1387,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_8i16_to_8i1: @@ -1386,6 +1395,7 @@ define i8 
@trunc_8i16_to_8i1(<8 x i16> %a) { ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ; SKX-NEXT: vpmovw2m %xmm0, %k0 ; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %mask_b = trunc <8 x i16>%a to <8 x i1> %mask = bitcast <8 x i1> %mask_b to i8 @@ -1395,6 +1405,8 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; KNL-LABEL: sext_8i1_8i32: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; KNL-NEXT: knotw %k0, %k1 ; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} @@ -1423,6 +1435,7 @@ define i16 @trunc_i32_to_i1(i32 %a) { ; ALL-NEXT: kmovw %eax, %k1 ; ALL-NEXT: korw %k0, %k1, %k0 ; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: ## kill: %AX %AX %EAX ; ALL-NEXT: retq %a_i = trunc i32 %a to i1 %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 @@ -1435,6 +1448,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_8i1_8i16: diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll index 89db1867928f..8bd57c0fc1da 100644 --- a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s @@ -14,6 +14,7 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind { define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounwind { ; SKX-LABEL: extract_subvector128_v32i16_first_element: ; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: retq %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> ret <8 x i16> %r1 @@ -31,6 +32,7 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind { define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwind { ; SKX-LABEL: extract_subvector128_v64i8_first_element: ; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: retq %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> ret <16 x i8> %r1 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 44ecad014071..2c42aca33e45 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -1,11 +1,25 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s -;CHECK-LABEL: test1: -;CHECK: vinsertps -;CHECK: vinsertf32x4 -;CHECK: ret define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { +; KNL-LABEL: test1: +; KNL: ## BB#0: +; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] +; 
KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test1: +; SKX: ## BB#0: +; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] +; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; SKX-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; SKX-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: retq %rrr = load float, float* %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14 @@ -15,19 +29,19 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { ; KNL-LABEL: test2: ; KNL: ## BB#0: -; KNL-NEXT: vmovhpd (%rdi), %xmm0, %xmm2 +; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] ; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 ; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; KNL-NEXT: vmovsd %xmm1, %xmm2, %xmm1 +; KNL-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1] ; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test2: ; SKX: ## BB#0: -; SKX-NEXT: vmovhpd (%rdi), %xmm0, %xmm2 +; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] ; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm0 ; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm2 -; SKX-NEXT: vmovsd %xmm1, %xmm2, %xmm1 +; SKX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1] ; SKX-NEXT: vinsertf64x2 $3, %xmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %rrr = load double, double* %br @@ -36,11 +50,20 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { ret <8 x double> %rrr3 } -;CHECK-LABEL: test3: -;CHECK: vextractf32x4 $1 -;CHECK: vinsertf32x4 $0 -;CHECK: ret define <16 x float> @test3(<16 x float> %x) nounwind { +; KNL-LABEL: test3: +; KNL: ## BB#0: +; KNL-NEXT: vextractf32x4 $1, %zmm0, %xmm1 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] +; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test3: +; SKX: ## BB#0: +; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm1 +; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] +; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: retq %eee = extractelement <16 x float> %x, i32 4 %rrr2 = insertelement <16 x float> %x, float %eee, i32 1 ret <16 x float> %rrr2 @@ -67,70 +90,140 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind { ret <8 x i64> %rrr2 } -;CHECK-LABEL: test5: -;CHECK: vextractps -;CHECK: ret define i32 @test5(<4 x float> %x) nounwind { +; KNL-LABEL: test5: +; KNL: ## BB#0: +; KNL-NEXT: vextractps $3, %xmm0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test5: +; SKX: ## BB#0: +; SKX-NEXT: vextractps $3, %xmm0, %eax +; SKX-NEXT: retq %ef = extractelement <4 x float> %x, i32 3 %ei = bitcast float %ef to i32 ret i32 %ei } -;CHECK-LABEL: test6: -;CHECK: vextractps {{.*}}, (%rdi) -;CHECK: ret define void @test6(<4 x float> %x, float* %out) nounwind { +; KNL-LABEL: test6: +; KNL: ## BB#0: +; KNL-NEXT: vextractps $3, %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: test6: +; SKX: ## BB#0: +; SKX-NEXT: vextractps $3, %xmm0, (%rdi) +; SKX-NEXT: retq %ef = extractelement <4 x float> %x, i32 3 store float %ef, float* %out, align 4 ret void } -;CHECK-LABEL: test7 -;CHECK: vmovd -;CHECK: vpermps %zmm -;CHECK: ret define float @test7(<16 x float> %x, i32 %ind) nounwind { +; KNL-LABEL: test7: +; KNL: ## 
BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test7: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq %e = extractelement <16 x float> %x, i32 %ind ret float %e } -;CHECK-LABEL: test8 -;CHECK: vmovq -;CHECK: vpermpd %zmm -;CHECK: ret define double @test8(<8 x double> %x, i32 %ind) nounwind { +; KNL-LABEL: test8: +; KNL: ## BB#0: +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test8: +; SKX: ## BB#0: +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq %e = extractelement <8 x double> %x, i32 %ind ret double %e } -;CHECK-LABEL: test9 -;CHECK: vmovd -;CHECK: vpermps %ymm -;CHECK: ret define float @test9(<8 x float> %x, i32 %ind) nounwind { +; KNL-LABEL: test9: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test9: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: retq %e = extractelement <8 x float> %x, i32 %ind ret float %e } -;CHECK-LABEL: test10 -;CHECK: vmovd -;CHECK: vpermd %zmm -;CHECK: vmovd %xmm0, %eax -;CHECK: ret define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { +; KNL-LABEL: test10: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test10: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: retq %e = extractelement <16 x i32> %x, i32 %ind ret i32 %e } -;CHECK-LABEL: test11 -;CHECK: vpcmpltud -;CHECK: kshiftlw $11 -;CHECK: kshiftrw $15 -;CHECK: testb -;CHECK: je -;CHECK: ret -;CHECK: ret define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { +; KNL-LABEL: test11: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb %al, %al +; KNL-NEXT: je LBB10_2 +; KNL-NEXT: ## BB#1: ## %A +; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: retq +; KNL-NEXT: LBB10_2: ## %B +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test11: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; SKX-NEXT: kshiftlw $11, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: je LBB10_2 +; SKX-NEXT: ## BB#1: ## %A +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq +; SKX-NEXT: LBB10_2: ## %B +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq %cmp_res = icmp ult <16 x i32> %a, %b %ia = extractelement <16 x i1> %cmp_res, i32 4 br i1 %ia, label %A, label %B @@ -141,70 +234,144 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ret <16 x i32>%c } -;CHECK-LABEL: test12 -;CHECK: vpcmpgtq -;CHECK: kshiftlw $15 -;CHECK: kshiftrw $15 -;CHECK: testb -;CHECK: ret - define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { - +; KNL-LABEL: test12: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 +; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 +; KNL-NEXT: kunpckbw 
%k0, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb %al, %al +; KNL-NEXT: cmoveq %rsi, %rdi +; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test12: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 +; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 +; SKX-NEXT: kunpckbw %k0, %k1, %k0 +; SKX-NEXT: kshiftlw $15, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: cmoveq %rsi, %rdi +; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: retq %cmpvector_func.i = icmp slt <16 x i64> %a, %b %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1 ret i64 %res } -;CHECK-LABEL: test13 -;CHECK: cmpl %esi, %edi -;CHECK: setb %al -;CHECK: kmovw %eax, %k0 -;CHECK: movw $-4 -;CHECK: korw define i16 @test13(i32 %a, i32 %b) { +; KNL-LABEL: test13: +; KNL: ## BB#0: +; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: movw $-4, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; +; SKX-LABEL: test13: +; SKX: ## BB#0: +; SKX-NEXT: cmpl %esi, %edi +; SKX-NEXT: setb %al +; SKX-NEXT: kmovw %eax, %k0 +; SKX-NEXT: movw $-4, %ax +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: retq %cmp_res = icmp ult i32 %a, %b %maskv = insertelement <16 x i1> , i1 %cmp_res, i32 0 %res = bitcast <16 x i1> %maskv to i16 ret i16 %res } -;CHECK-LABEL: test14 -;CHECK: vpcmpgtq -;KNL: kshiftlw $11 -;KNL: kshiftrw $15 -;KNL: testb -;SKX: kshiftlb $3 -;SKX: kshiftrb $7 -;SKX: testb -;CHECK: ret - define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { - +; KNL-LABEL: test14: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb %al, %al +; KNL-NEXT: cmoveq %rsi, %rdi +; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test14: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 +; SKX-NEXT: kshiftlb $3, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb %al, %al +; SKX-NEXT: cmoveq %rsi, %rdi +; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: retq %cmpvector_func.i = icmp slt <8 x i64> %a, %b %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1 ret i64 %res } -;CHECK-LABEL: test15 -;CHECK: movb (%rdi), %al -;CHECK: movw $-1, %ax -;CHECK: cmovew define i16 @test15(i1 *%addr) { +; KNL-LABEL: test15: +; KNL: ## BB#0: +; KNL-NEXT: movb (%rdi), %al +; KNL-NEXT: xorl %ecx, %ecx +; KNL-NEXT: testb %al, %al +; KNL-NEXT: movw $-1, %ax +; KNL-NEXT: cmovew %cx, %ax +; KNL-NEXT: retq +; +; SKX-LABEL: test15: +; SKX: ## BB#0: +; SKX-NEXT: movb (%rdi), %al +; SKX-NEXT: xorl %ecx, %ecx +; SKX-NEXT: testb %al, %al +; SKX-NEXT: movw $-1, %ax +; SKX-NEXT: cmovew %cx, %ax +; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 1 %x1 = insertelement <16 x i1> undef, i1 %x, i32 10 %x2 = bitcast <16 x i1>%x1 to i16 ret i16 %x2 } -;CHECK-LABEL: test16 -;CHECK: movzbl (%rdi), %eax -;CHECK: kmovw -;CHECK: kshiftlw $10 -;CHECK: korw -;CHECK: ret define i16 @test16(i1 *%addr, i16 %a) { +; KNL-LABEL: test16: +; KNL: ## BB#0: +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: 
andl $1, %eax +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; +; SKX-LABEL: test16: +; SKX: ## BB#0: +; SKX-NEXT: movzbl (%rdi), %eax +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: kmovd %eax, %k0 +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kshiftlw $10, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 %a1 = bitcast i16 %a to <16 x i1> %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10 @@ -212,15 +379,30 @@ define i16 @test16(i1 *%addr, i16 %a) { ret i16 %x2 } -;CHECK-LABEL: test17 -;KNL: movzbl (%rdi), %eax -;KNL: andl $1, %eax -;KNL: kshiftlw $4 -;KNL: korw -;SKX: kshiftlb $4 -;SKX: korb -;CHECK: ret define i8 @test17(i1 *%addr, i8 %a) { +; KNL-LABEL: test17: +; KNL: ## BB#0: +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; +; SKX-LABEL: test17: +; SKX: ## BB#0: +; SKX-NEXT: movzbl (%rdi), %eax +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: kmovd %eax, %k0 +; SKX-NEXT: kmovb %esi, %k1 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: ## kill: %AL %AL %EAX +; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 %a1 = bitcast i8 %a to <8 x i1> %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4 @@ -229,6 +411,13 @@ define i8 @test17(i1 *%addr, i8 %a) { } define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) { +; KNL-LABEL: extract_v8i64: +; KNL: ## BB#0: +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v8i64: ; SKX: ## BB#0: ; SKX-NEXT: vpextrq $1, %xmm0, %rax @@ -242,6 +431,13 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) { } define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { +; KNL-LABEL: extract_v4i64: +; KNL: ## BB#0: +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v4i64: ; SKX: ## BB#0: ; SKX-NEXT: vpextrq $1, %xmm0, %rax @@ -255,6 +451,12 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { } define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) { +; KNL-LABEL: extract_v2i64: +; KNL: ## BB#0: +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v2i64: ; SKX: ## BB#0: ; SKX-NEXT: vmovq %xmm0, %rax @@ -267,6 +469,13 @@ define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) { } define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) { +; KNL-LABEL: extract_v16i32: +; KNL: ## BB#0: +; KNL-NEXT: vpextrd $1, %xmm0, %eax +; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; KNL-NEXT: vpextrd $1, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v16i32: ; SKX: ## BB#0: ; SKX-NEXT: vpextrd $1, %xmm0, %eax @@ -280,6 +489,13 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) { } define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { +; KNL-LABEL: extract_v8i32: +; KNL: ## BB#0: +; KNL-NEXT: vpextrd $1, %xmm0, %eax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrd $1, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v8i32: ; SKX: ## BB#0: ; SKX-NEXT: vpextrd $1, %xmm0, %eax 
@@ -293,6 +509,12 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { } define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) { +; KNL-LABEL: extract_v4i32: +; KNL: ## BB#0: +; KNL-NEXT: vpextrd $1, %xmm0, %eax +; KNL-NEXT: vpextrd $3, %xmm0, (%rdi) +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v4i32: ; SKX: ## BB#0: ; SKX-NEXT: vpextrd $1, %xmm0, %eax @@ -305,11 +527,20 @@ define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) { } define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) { +; KNL-LABEL: extract_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: vpextrw $1, %xmm0, %eax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrw $1, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v32i16: ; SKX: ## BB#0: ; SKX-NEXT: vpextrw $1, %xmm0, %eax ; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq %r1 = extractelement <32 x i16> %x, i32 1 %r2 = extractelement <32 x i16> %x, i32 9 @@ -318,11 +549,20 @@ define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) { } define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { +; KNL-LABEL: extract_v16i16: +; KNL: ## BB#0: +; KNL-NEXT: vpextrw $1, %xmm0, %eax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrw $1, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v16i16: ; SKX: ## BB#0: ; SKX-NEXT: vpextrw $1, %xmm0, %eax ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq %r1 = extractelement <16 x i16> %x, i32 1 %r2 = extractelement <16 x i16> %x, i32 9 @@ -331,10 +571,18 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { } define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) { +; KNL-LABEL: extract_v8i16: +; KNL: ## BB#0: +; KNL-NEXT: vpextrw $1, %xmm0, %eax +; KNL-NEXT: vpextrw $3, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v8i16: ; SKX: ## BB#0: ; SKX-NEXT: vpextrw $1, %xmm0, %eax ; SKX-NEXT: vpextrw $3, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq %r1 = extractelement <8 x i16> %x, i32 1 %r2 = extractelement <8 x i16> %x, i32 3 @@ -343,11 +591,20 @@ define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) { } define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) { +; KNL-LABEL: extract_v64i8: +; KNL: ## BB#0: +; KNL-NEXT: vpextrb $1, %xmm0, %eax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrb $1, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v64i8: ; SKX: ## BB#0: ; SKX-NEXT: vpextrb $1, %xmm0, %eax ; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %r1 = extractelement <64 x i8> %x, i32 1 %r2 = extractelement <64 x i8> %x, i32 17 @@ -356,11 +613,20 @@ define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) { } define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) { +; KNL-LABEL: extract_v32i8: +; KNL: ## BB#0: +; KNL-NEXT: vpextrb $1, %xmm0, %eax +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpextrb $1, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v32i8: ; SKX: ## BB#0: ; SKX-NEXT: vpextrb $1, %xmm0, %eax ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %r1 = extractelement <32 x i8> %x, i32 1 %r2 = extractelement <32 x i8> %x, i32 17 @@ -369,10 +635,18 @@ define i8 
@extract_v32i8(<32 x i8> %x, i8* %dst) { } define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) { +; KNL-LABEL: extract_v16i8: +; KNL: ## BB#0: +; KNL-NEXT: vpextrb $1, %xmm0, %eax +; KNL-NEXT: vpextrb $3, %xmm0, (%rdi) +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; ; SKX-LABEL: extract_v16i8: ; SKX: ## BB#0: ; SKX-NEXT: vpextrb $1, %xmm0, %eax ; SKX-NEXT: vpextrb $3, %xmm0, (%rdi) +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %r1 = extractelement <16 x i8> %x, i32 1 %r2 = extractelement <16 x i8> %x, i32 3 @@ -381,6 +655,15 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) { } define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { +; KNL-LABEL: insert_v8i64: +; KNL: ## BB#0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 +; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; ; SKX-LABEL: insert_v8i64: ; SKX: ## BB#0: ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 @@ -396,6 +679,15 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { } define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { +; KNL-LABEL: insert_v4i64: +; KNL: ## BB#0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; ; SKX-LABEL: insert_v4i64: ; SKX: ## BB#0: ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 @@ -411,6 +703,12 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { } define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { +; KNL-LABEL: insert_v2i64: +; KNL: ## BB#0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0 +; KNL-NEXT: vpinsrq $3, %rdi, %xmm0, %xmm0 +; KNL-NEXT: retq +; ; SKX-LABEL: insert_v2i64: ; SKX: ## BB#0: ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0 @@ -423,6 +721,15 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { } define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { +; KNL-LABEL: insert_v16i32: +; KNL: ## BB#0: +; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; KNL-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 +; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; ; SKX-LABEL: insert_v16i32: ; SKX: ## BB#0: ; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 @@ -570,6 +877,15 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { } define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { +; KNL-LABEL: insert_v32i8: +; KNL: ## BB#0: +; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: retq +; ; SKX-LABEL: insert_v32i8: ; SKX: ## BB#0: ; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 @@ -643,7 +959,7 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) { ; ; SKX-LABEL: test_insert_128_v8f64: ; SKX: ## BB#0: -; SKX-NEXT: vunpcklpd %xmm1, %xmm0, %xmm1 +; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %r = insertelement <8 x double> %x, double %y, i32 1 @@ -653,13 +969,13 @@ define <8 x double> @test_insert_128_v8f64(<8 x 
double> %x, double %y) { define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) { ; KNL-LABEL: test_insert_128_v16f32: ; KNL: ## BB#0: -; KNL-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] ; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_insert_128_v16f32: ; SKX: ## BB#0: -; SKX-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1 +; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] ; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %r = insertelement <16 x float> %x, float %y, i32 1 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 41704fd8c53a..ee529bda782e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -504,6 +504,7 @@ define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) ret i16 %res @@ -515,6 +516,7 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) ret i16 %res @@ -527,6 +529,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) ret i8 %res @@ -538,6 +541,7 @@ define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret i8 %res @@ -550,6 +554,7 @@ define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) ret i16 %res @@ -561,6 +566,7 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) ret i16 %res @@ -573,6 +579,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) ret i8 %res @@ -584,6 +591,7 @@ define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 
x i64> %a, <8 x i64> %b, i8 %mask) ret i8 %res diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index b692bcfed955..cce3ae62fdb6 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -67,6 +67,7 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) { ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: kunpckbw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) ret i16 %res @@ -734,6 +735,7 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) ret i16 %res @@ -745,6 +747,7 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) ret i8 %res @@ -817,6 +820,7 @@ define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) { ; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) @@ -834,6 +838,7 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) { ; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) @@ -4897,6 +4902,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) @@ -4917,6 +4923,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; CHECK-NEXT: kandw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) @@ -4939,6 +4946,7 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) @@ -4960,6 +4968,7 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; CHECK-NEXT: kmovw %k0, %edx ; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 
-1, i32 4) %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) @@ -5548,6 +5557,7 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5569,6 +5579,7 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3] ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3] @@ -5590,6 +5601,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5611,6 +5623,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3] ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3] @@ -6340,6 +6353,7 @@ define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 ; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) @@ -6358,6 +6372,7 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2 ; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 0c94a8bc1c90..b867297df741 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -8,6 +8,7 @@ define i16 @mask16(i16 %x) { ; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: knotw %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, @@ -35,6 +36,7 @@ define i8 
@mask8(i8 %x) { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: knotw %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: mask8: @@ -42,6 +44,7 @@ define i8 @mask8(i8 %x) { ; SKX-NEXT: kmovb %edi, %k0 ; SKX-NEXT: knotb %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, @@ -136,6 +139,7 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { ; CHECK-NEXT: kxorw %k1, %k0, %k0 ; CHECK-NEXT: korw %k0, %k2, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %ma = load <16 x i1>, <16 x i1>* %x %mb = load <16 x i1>, <16 x i1>* %y @@ -152,6 +156,7 @@ define i8 @shuf_test1(i16 %v) nounwind { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: shuf_test1: @@ -159,6 +164,7 @@ define i8 @shuf_test1(i16 %v) nounwind { ; SKX-NEXT: kmovw %edi, %k0 ; SKX-NEXT: kshiftrw $8, %k0, %k0 ; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> @@ -201,6 +207,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: kshiftlw $10, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 @@ -674,6 +681,7 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { define void @test22(<4 x i1> %a, <4 x i1>* %addr) { ; KNL-LABEL: test22: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: vpslld $31, %ymm0, %ymm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -693,6 +701,7 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) { define void @test23(<2 x i1> %a, <2 x i1>* %addr) { ; KNL-LABEL: test23: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1395,6 +1404,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: load_2i1: @@ -1414,6 +1424,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: load_4i1: diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 6a4ec4af7037..2ac91cc7482a 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -90,6 +90,7 @@ define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) { ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %mask = load <8 x i1> , <8 x i1>* %m %a = load <8 x i1> , <8 x i1>* %a.0 @@ -120,6 +121,7 @@ define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) { ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %mask = load <8 x i1> , <8 x i1>* %m %a = load <8 x i1> , <8 x i1>* %a.0 @@ -138,6 
+140,7 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) { ; CHECK-NEXT: kandw %k0, %k2, %k0 ; CHECK-NEXT: korw %k0, %k1, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %mask = bitcast i8 %m to <8 x i1> %a = bitcast i8 %a.0 to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll index e4e5c2b8a1d5..35be44140026 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -53,7 +53,9 @@ define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { ; KNL-LABEL: trunc_qb_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qb_256: @@ -67,6 +69,7 @@ define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { ; KNL-LABEL: trunc_qb_256_mem: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovd %xmm0, (%rdi) @@ -128,7 +131,9 @@ define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { ; KNL-LABEL: trunc_qw_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qw_256: @@ -142,6 +147,7 @@ define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { ; KNL-LABEL: trunc_qw_256_mem: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; KNL-NEXT: vmovq %xmm0, (%rdi) @@ -203,7 +209,9 @@ define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { ; KNL-LABEL: trunc_qd_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_256: @@ -217,6 +225,7 @@ define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { ; KNL-LABEL: trunc_qd_256_mem: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vmovaps %xmm0, (%rdi) ; KNL-NEXT: retq @@ -276,7 +285,9 @@ define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { ; KNL-LABEL: trunc_db_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_db_256: @@ -290,6 +301,7 @@ define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { ; KNL-LABEL: trunc_db_256_mem: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) @@ -350,7 +362,9 @@ define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { ; KNL-LABEL: trunc_dw_256: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq 
; ; SKX-LABEL: trunc_dw_256: @@ -364,6 +378,7 @@ define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { ; KNL-LABEL: trunc_dw_256_mem: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vmovaps %xmm0, (%rdi) ; KNL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll index b8afda95f1b4..6a7ed02e0311 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -124,6 +124,7 @@ define <8 x double> @_inreg8xdouble(double %a) { define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) { ; ALL-LABEL: _sd8xdouble_mask: ; ALL: # BB#0: +; ALL-NEXT: # kill: %YMM2 %YMM2 %ZMM2 ; ALL-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 ; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} @@ -139,6 +140,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { ; ALL-LABEL: _sd8xdouble_maskz: ; ALL: # BB#0: +; ALL-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} @@ -164,6 +166,7 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) { define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) { ; ALL-LABEL: _sd8xdouble_mask_load: ; ALL: # BB#0: +; ALL-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} @@ -179,6 +182,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) { ; ALL-LABEL: _sd8xdouble_maskz_load: ; ALL: # BB#0: +; ALL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index c9ec032ba378..49aea228182a 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX @@ -111,8 +111,11 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { ; KNL-LABEL: test9: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: test9: @@ -128,8 +131,11 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { ; KNL-LABEL: test10: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: 
retq ; ; SKX-LABEL: test10: @@ -160,6 +166,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind { ; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 ; CHECK-NEXT: kunpckbw %k0, %k1, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = icmp eq <16 x i64> %a, %b %res1 = bitcast <16 x i1> %res to i16 @@ -167,6 +174,155 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind { } define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind { +; KNL-LABEL: test12_v32i32: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $32, %rsp +; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: 
kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: movl (%rsp), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; ; SKX-LABEL: test12_v32i32: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 @@ -180,6 +336,308 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind { } define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind { +; KNL-LABEL: test12_v64i16: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, 
%eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $3, 
%k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, (%rsp) +; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: 
kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftlw $0, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl (%rsp), %ecx +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: shlq $32, %rax +; KNL-NEXT: orq %rcx, %rax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; ; SKX-LABEL: test12_v64i16: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k0 @@ -538,9 +996,12 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) nounwind { ; KNL-LABEL: test35: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vmovups (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: test35: @@ -649,9 +1110,12 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) nounwind { ; KNL-LABEL: test41: ; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vbroadcastss (%rdi), %ymm2 ; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1 ; 
KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: test41: diff --git a/llvm/test/CodeGen/X86/avx512bw-mov.ll b/llvm/test/CodeGen/X86/avx512bw-mov.ll index 747e4f3b2dc6..c58b3cc8c3cd 100644 --- a/llvm/test/CodeGen/X86/avx512bw-mov.ll +++ b/llvm/test/CodeGen/X86/avx512bw-mov.ll @@ -105,6 +105,7 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; CHECK-NEXT: kshiftlq $48, %k0, %k0 ; CHECK-NEXT: kshiftrq $48, %k0, %k1 ; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: retq %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef) ret <16 x i8> %res @@ -119,6 +120,7 @@ define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x ; CHECK-NEXT: kshiftlq $32, %k0, %k0 ; CHECK-NEXT: kshiftrq $32, %k0, %k1 ; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: retq %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer) ret <32 x i8> %res @@ -133,6 +135,7 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; CHECK-NEXT: kshiftld $24, %k0, %k0 ; CHECK-NEXT: kshiftrd $24, %k0, %k1 ; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: retq %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef) ret <8 x i16> %res @@ -147,6 +150,7 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; CHECK-NEXT: kshiftld $16, %k0, %k0 ; CHECK-NEXT: kshiftrd $16, %k0, %k1 ; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: retq %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer) ret <16 x i16> %res @@ -156,6 +160,7 @@ declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) { ; CHECK-LABEL: test_mask_store_16xi8: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 ; CHECK-NEXT: vpmovb2m %zmm0, %k0 ; CHECK-NEXT: kshiftlq $48, %k0, %k0 @@ -170,6 +175,7 @@ declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) { ; CHECK-LABEL: test_mask_store_32xi8: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 ; CHECK-NEXT: vpmovb2m %zmm0, %k0 ; CHECK-NEXT: kshiftlq $32, %k0, %k0 @@ -184,6 +190,7 @@ declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>) define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) { ; CHECK-LABEL: test_mask_store_8xi16: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 ; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 ; CHECK-NEXT: vpmovw2m %zmm0, %k0 ; CHECK-NEXT: kshiftld $24, %k0, %k0 @@ -198,6 +205,7 @@ declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) { ; CHECK-LABEL: test_mask_store_16xi16: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 ; CHECK-NEXT: vpmovb2m %zmm0, %k0 ; CHECK-NEXT: kshiftld 
$16, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index a00a0b650b37..9373561ea3ae 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -415,6 +415,7 @@ define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res @@ -426,6 +427,7 @@ define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res @@ -461,6 +463,7 @@ define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ret i16 %res @@ -472,6 +475,7 @@ define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ret i16 %res diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index d05f853bee55..e1d9351b64b4 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -402,6 +402,7 @@ define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) ret i16 %res @@ -413,6 +414,7 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) ret i16 %res @@ -425,6 +427,7 @@ define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## 
encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) ret i8 %res @@ -436,6 +439,7 @@ define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) ret i8 %res @@ -448,6 +452,7 @@ define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) ret i16 %res @@ -459,6 +464,7 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) ret i16 %res @@ -471,6 +477,7 @@ define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) ret i8 %res @@ -482,6 +489,7 @@ define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) ret i8 %res @@ -5219,6 +5227,7 @@ define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0) ret i16 %res @@ -5243,6 +5252,7 @@ define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0) ret i8 %res @@ -5255,6 +5265,7 @@ define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: 
[0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0) ret i16 %res @@ -5847,6 +5858,7 @@ define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x ; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) @@ -5883,6 +5895,7 @@ define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) ; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) @@ -5901,6 +5914,7 @@ define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 ; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) @@ -5919,6 +5933,7 @@ define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 % ; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1) @@ -5955,6 +5970,7 @@ define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2 ; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1) @@ -5973,6 +5989,7 @@ define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 ; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1) diff --git 
a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index af14b6b0d93e..35db4901135f 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -457,6 +457,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) { ; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0 ; CHECK-NEXT: kmovb %k0, %eax ; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1) @@ -474,6 +475,7 @@ define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) { ; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1) %res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1) @@ -503,6 +505,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-NEXT: movb $-1, %cl ; CHECK-NEXT: LBB28_4: ; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1) @@ -532,6 +535,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-NEXT: movb $-1, %cl ; CHECK-NEXT: LBB29_4: ; CHECK-NEXT: addb %cl, %al +; CHECK-NEXT: ## kill: %AL %AL %AX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1) @@ -586,6 +590,7 @@ define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovd2m %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0) ret i16 %res @@ -598,6 +603,7 @@ define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovq2m %zmm0, %k0 ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0) ret i8 %res @@ -632,6 +638,7 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] @@ -653,6 +660,7 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1] @@ -674,6 +682,7 @@ declare <16 x i32> 
@llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] @@ -695,6 +704,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1] diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll index e83aa14d35e3..27c0b06d5f23 100644 --- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -7,6 +7,7 @@ define i8 @mask8(i8 %x) { ; CHECK-NEXT: kmovb %edi, %k0 ; CHECK-NEXT: knotb %k0, %k0 ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, @@ -56,6 +57,7 @@ define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) { ; CHECK-NEXT: kxorb %k1, %k0, %k0 ; CHECK-NEXT: korb %k0, %k2, %k0 ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %ma = load <8 x i1>, <8 x i1>* %x %mb = load <8 x i1>, <8 x i1>* %y diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 19cffba6c43f..12f37a00675b 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -2143,6 +2143,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) { ; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1) @@ -2161,6 +2162,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) { ; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1) @@ -2179,6 +2181,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) { ; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 
x double> %x0, i32 4, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1) @@ -2197,6 +2200,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) { ; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1) @@ -2274,6 +2278,7 @@ define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0) ret i8 %res @@ -2286,6 +2291,7 @@ define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0) ret i8 %res @@ -2298,6 +2304,7 @@ define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0) ret i8 %res @@ -2310,6 +2317,7 @@ define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0] ; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0) ret i8 %res @@ -2367,6 +2375,7 @@ declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] ; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00] ; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] @@ -2391,6 +2400,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] ; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00] ; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 04da44cfa8b9..e546efcaca33 100644 --- 
a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -980,6 +980,7 @@ define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) ret i8 %res @@ -991,6 +992,7 @@ define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) ret i8 %res @@ -1005,6 +1007,7 @@ define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) ret i8 %res @@ -1018,6 +1021,7 @@ define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) ret i8 %res @@ -1030,6 +1034,7 @@ define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) ret i8 %res @@ -1041,6 +1046,7 @@ define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) ret i8 %res @@ -1055,6 +1061,7 @@ define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) ret i8 %res @@ -1068,6 +1075,7 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## 
encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) ret i8 %res @@ -1082,6 +1090,7 @@ define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) ret i8 %res @@ -1095,6 +1104,7 @@ define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) ret i8 %res @@ -1111,6 +1121,7 @@ define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) ret i8 %res @@ -1126,6 +1137,7 @@ define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) ret i8 %res @@ -1140,6 +1152,7 @@ define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) ret i8 %res @@ -1153,6 +1166,7 @@ define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) ret i8 %res @@ -1169,6 +1183,7 @@ define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: 
[0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) ret i8 %res @@ -1184,6 +1199,7 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c] ; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) ret i8 %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index bc03eadcb96a..8ae290b34391 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2900,6 +2900,7 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1) ret i8 %res @@ -2911,6 +2912,7 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1) ret i8 %res @@ -2922,6 +2924,7 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1) ret i8 %res @@ -2933,6 +2936,7 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1) ret i8 %res @@ -6341,6 +6345,7 @@ declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x f define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00] ; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] @@ -6364,6 +6369,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { ; 
CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: ; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00] ; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3] @@ -8107,6 +8113,7 @@ define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) ; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) @@ -8125,6 +8132,7 @@ define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) ; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) @@ -8143,6 +8151,7 @@ define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) ; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) @@ -8161,6 +8170,7 @@ define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) ; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) @@ -8179,6 +8189,7 @@ define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1) @@ -8197,6 +8208,7 @@ define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2 ; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 
%x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1) @@ -8215,6 +8227,7 @@ define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1) @@ -8233,6 +8246,7 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2 ; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8] +; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1) diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 3b6280f16cd7..f1b325a03ebd 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -131,6 +131,8 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; CHECK-NEXT: shrl $15, %edx ; CHECK-NEXT: orl %esi, %edx ; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: # kill: %DX %DX %EDX ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll index 10b1f51e3da9..afeba4ef2d99 100644 --- a/llvm/test/CodeGen/X86/bmi.ll +++ b/llvm/test/CodeGen/X86/bmi.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s declare i8 @llvm.cttz.i8(i8, i1) @@ -12,6 +12,7 @@ define i8 @t1(i8 %x) { ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: orl $256, %eax # imm = 0x100 ; CHECK-NEXT: tzcntl %eax, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX ; CHECK-NEXT: retq %tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 false ) ret i8 %tmp @@ -59,6 +60,7 @@ define i8 @t5(i8 %x) { ; CHECK: # BB#0: ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: tzcntl %eax, %eax +; CHECK-NEXT: # kill: %AL %AL %EAX ; CHECK-NEXT: retq %tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 true ) ret i8 %tmp @@ -455,6 +457,7 @@ entry: define i64 @bzhi64b(i64 %x, i8 zeroext %index) { ; CHECK-LABEL: bzhi64b: ; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI %ESI %RSI ; CHECK-NEXT: bzhiq %rsi, %rdi, %rax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll index 8822515e0d50..9fcaa4c4fe5a 100644 --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -14,6 +14,7 @@ define i8 @cttz_i8(i8 %x) { ; CHECK: # BB#0: ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: bsfl %eax, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true ) ret i8 %tmp @@ -52,6 +53,7 @@ define i8 @ctlz_i8(i8 %x) { ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: bsrl %eax, %eax ; CHECK-NEXT: xorl $7, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true ) ret i8 
%tmp2 @@ -62,6 +64,7 @@ define i16 @ctlz_i16(i16 %x) { ; CHECK: # BB#0: ; CHECK-NEXT: bsrw %di, %ax ; CHECK-NEXT: xorl $15, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true ) ret i16 %tmp2 @@ -100,6 +103,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; CHECK-NEXT: bsrl %eax, %eax ; CHECK-NEXT: xorl $7, %eax ; CHECK-NEXT: .LBB8_2: # %cond.end +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false) ret i8 %tmp1 @@ -117,6 +121,7 @@ define i16 @ctlz_i16_zero_test(i16 %n) { ; CHECK-NEXT: bsrw %di, %ax ; CHECK-NEXT: xorl $15, %eax ; CHECK-NEXT: .LBB9_2: # %cond.end +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false) ret i16 %tmp1 @@ -168,6 +173,7 @@ define i8 @cttz_i8_zero_test(i8 %n) { ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: bsfl %eax, %eax ; CHECK-NEXT: .LBB12_2: # %cond.end +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false) ret i8 %tmp1 diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll index 1f6fdda46b53..eb7cdb6b57be 100644 --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -18,11 +18,13 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind { ; SSE41-LABEL: extractelement_v16i8_1: ; SSE41: # BB#0: ; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX ; SSE41-NEXT: retq ; ; AVX-LABEL: extractelement_v16i8_1: ; AVX: # BB#0: ; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq %b = extractelement <16 x i8> %a, i256 1 ret i8 %b @@ -38,11 +40,13 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind { ; SSE41-LABEL: extractelement_v16i8_11: ; SSE41: # BB#0: ; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX ; SSE41-NEXT: retq ; ; AVX-LABEL: extractelement_v16i8_11: ; AVX: # BB#0: ; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq %b = extractelement <16 x i8> %a, i256 11 ret i8 %b @@ -58,11 +62,13 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind { ; SSE41-LABEL: extractelement_v16i8_14: ; SSE41: # BB#0: ; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX ; SSE41-NEXT: retq ; ; AVX-LABEL: extractelement_v16i8_14: ; AVX: # BB#0: ; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq %b = extractelement <16 x i8> %a, i256 14 ret i8 %b @@ -78,11 +84,13 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind { ; SSE41-LABEL: extractelement_v32i8_1: ; SSE41: # BB#0: ; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX ; SSE41-NEXT: retq ; ; AVX-LABEL: extractelement_v32i8_1: ; AVX: # BB#0: ; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %b = extractelement <32 x i8> %a, i256 1 @@ -99,12 +107,14 @@ define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind { ; SSE41-LABEL: extractelement_v32i8_17: ; SSE41: # BB#0: ; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX ; SSE41-NEXT: retq ; ; AVX1-LABEL: extractelement_v32i8_17: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: # kill: %AL %AL %EAX ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -112,6 +122,7 @@ define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind { ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: 
vpextrb $1, %xmm0, %eax +; AVX2-NEXT: # kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %b = extractelement <32 x i8> %a, i256 17 @@ -122,11 +133,13 @@ define i16 @extractelement_v8i16_0(<8 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v8i16_0: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v8i16_0: ; AVX: # BB#0: ; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: %AX %AX %EAX ; AVX-NEXT: retq %b = extractelement <8 x i16> %a, i256 0 ret i16 %b @@ -136,11 +149,13 @@ define i16 @extractelement_v8i16_3(<8 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v8i16_3: ; SSE: # BB#0: ; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v8i16_3: ; AVX: # BB#0: ; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: # kill: %AX %AX %EAX ; AVX-NEXT: retq %b = extractelement <8 x i16> %a, i256 3 ret i16 %b @@ -150,11 +165,13 @@ define i16 @extractelement_v16i16_0(<16 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v16i16_0: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v16i16_0: ; AVX: # BB#0: ; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: %AX %AX %EAX ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %b = extractelement <16 x i16> %a, i256 0 @@ -165,12 +182,14 @@ define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v16i16_13: ; SSE: # BB#0: ; SSE-NEXT: pextrw $5, %xmm1, %eax +; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq ; ; AVX1-LABEL: extractelement_v16i16_13: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: # kill: %AX %AX %EAX ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -178,6 +197,7 @@ define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind { ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpextrw $5, %xmm0, %eax +; AVX2-NEXT: # kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %b = extractelement <16 x i16> %a, i256 13 diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll index a65f5eeb8ae1..6b7d39548385 100644 --- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll @@ -43,6 +43,7 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax +; X32-NEXT: # kill: %AX %AX %EAX ; X32-NEXT: retl ; ; X64-LABEL: test_cvtss_sh: @@ -51,6 +52,7 @@ define i16 @test_cvtss_sh(float %a0) nounwind { ; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %ins0 = insertelement <4 x float> undef, float %a0, i32 0 %ins1 = insertelement <4 x float> %ins0, float 0.000000e+00, i32 1 diff --git a/llvm/test/CodeGen/X86/fixup-bw-copy.ll b/llvm/test/CodeGen/X86/fixup-bw-copy.ll index 9528253ef5d0..9067dfd29c17 100644 --- a/llvm/test/CodeGen/X86/fixup-bw-copy.ll +++ b/llvm/test/CodeGen/X86/fixup-bw-copy.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=x86_64-- < %s | FileCheck 
--check-prefix=X64 --check-prefix=BWON64 %s ; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=0 -mtriple=x86_64-- < %s | FileCheck --check-prefix=X64 --check-prefix=BWOFF64 %s ; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=i386-- < %s | FileCheck --check-prefix=X32 --check-prefix=BWON32 %s @@ -54,6 +54,7 @@ define i8 @test_movb_hreg(i16 %a0) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: addb %dil, %al +; X64-NEXT: # kill: %AL %AL %EAX ; X64-NEXT: retq ; ; X32-LABEL: test_movb_hreg: diff --git a/llvm/test/CodeGen/X86/h-registers-3.ll b/llvm/test/CodeGen/X86/h-registers-3.ll index ca7c7a7d1705..819f21625abf 100644 --- a/llvm/test/CodeGen/X86/h-registers-3.ll +++ b/llvm/test/CodeGen/X86/h-registers-3.ll @@ -17,13 +17,17 @@ entry: ; X64-LABEL: foo ; X64: callq +; X64-NEXT: # kill ; X64-NEXT: shrl $8, %eax +; X64-NEXT: # kill ; X64-NEXT: popq ; X64-NEXT: retq ; X32-LABEL: foo ; X32: callq +; X32-NEXT: # kill ; X32-NEXT: shrl $8, %eax +; X32-NEXT: # kill ; X32-NEXT: popq ; X32-NEXT: retq } diff --git a/llvm/test/CodeGen/X86/machine-combiner-int.ll b/llvm/test/CodeGen/X86/machine-combiner-int.ll index 47a83597f2dd..7951a92c5246 100644 --- a/llvm/test/CodeGen/X86/machine-combiner-int.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int.ll @@ -10,9 +10,12 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) { ; CHECK-LABEL: reassociate_muls_i16: ; CHECK: # BB#0: +; CHECK-NEXT: # kill +; CHECK-NEXT: # kill ; CHECK-NEXT: leal (%rdi,%rsi), %eax ; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: imull %edx, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retq %t0 = add i16 %x0, %x1 %t1 = mul i16 %x2, %t0 @@ -23,6 +26,8 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) { define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { ; CHECK-LABEL: reassociate_muls_i32: ; CHECK: # BB#0: +; CHECK-NEXT: # kill +; CHECK-NEXT: # kill ; CHECK-NEXT: leal (%rdi,%rsi), %eax ; CHECK-NEXT: imull %ecx, %edx ; CHECK-NEXT: imull %edx, %eax diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index a8cf982323f6..04505a3e4b55 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX @@ -676,23 +675,25 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; ; KNL_64-LABEL: test15: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL_64: vpxor %ymm2, %ymm2, %ymm2 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} +; KNL_64-NEXT: # kill ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test15: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL_32: vpxor %ymm2, %ymm2, %ymm2 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0 ; KNL_32-NEXT: vptestmd %zmm0, 
%zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} +; KNL_32-NEXT: # kill ; KNL_32-NEXT: retl ; ; SKX-LABEL: test15: @@ -723,7 +724,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; ; KNL_64-LABEL: test16: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_64: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -737,7 +738,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; ; KNL_32-LABEL: test16: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -777,7 +778,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; ; KNL_64-LABEL: test17: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -787,7 +788,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; ; KNL_32-LABEL: test17: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1 @@ -829,7 +830,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; ; KNL_64-LABEL: test18: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_64: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 @@ -838,7 +839,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; ; KNL_32-LABEL: test18: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_32: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 @@ -867,7 +868,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; ; KNL_64-LABEL: test19: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_64: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -879,7 +880,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; ; KNL_32-LABEL: test19: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 @@ -914,7 +915,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; KNL_64-LABEL: test20: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -925,7 +926,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; KNL_32-LABEL: test20: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_32: vpshufd 
{{.*#+}} xmm2 = xmm2[0,2,2,3] ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -938,7 +939,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; ; SKX-LABEL: test20: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 +; SKX: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlb $6, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 @@ -963,7 +964,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; KNL_64-LABEL: test21: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 @@ -973,7 +974,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; KNL_32-LABEL: test21: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2 @@ -983,7 +984,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; SKX-LABEL: test21: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 +; SKX: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX-NEXT: kshiftlb $6, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 @@ -993,7 +994,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; ; SKX_32-LABEL: test21: ; SKX_32: # BB#0: -; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 +; SKX_32: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 @@ -1012,7 +1013,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; ; KNL_64-LABEL: test22: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1026,7 +1027,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; ; KNL_32-LABEL: test22: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1074,7 +1075,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; ; KNL_64-LABEL: test23: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -1084,7 +1085,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; ; KNL_32-LABEL: test23: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 @@ -1118,7 +1119,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % define <2 x i32> @test24(i32* %base, <2 x i32> 
%ind) { ; KNL_64-LABEL: test24: ; KNL_64: # BB#0: -; KNL_64-NEXT: movb $3, %al +; KNL_64: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 @@ -1126,7 +1127,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; ; KNL_32-LABEL: test24: ; KNL_32: # BB#0: -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 ; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 @@ -1159,7 +1160,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; ; KNL_64-LABEL: test25: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64: vpxord %zmm3, %zmm3, %zmm3 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -1169,7 +1170,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; ; KNL_32-LABEL: test25: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32: vpxord %zmm3, %zmm3, %zmm3 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 @@ -1204,7 +1205,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; ; KNL_64-LABEL: test26: ; KNL_64: # BB#0: -; KNL_64-NEXT: movb $3, %al +; KNL_64: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovaps %zmm1, %zmm0 @@ -1212,7 +1213,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; ; KNL_32-LABEL: test26: ; KNL_32: # BB#0: -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 ; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 @@ -1251,6 +1252,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} +; KNL_64-NEXT: # kill ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: @@ -1261,6 +1263,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_32-NEXT: movb $3, %cl ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} +; KNL_32-NEXT: # kill ; KNL_32-NEXT: retl ; ; SKX-LABEL: test27: @@ -1282,7 +1285,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; KNL_64-LABEL: test28: ; KNL_64: # BB#0: -; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -1290,7 +1293,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; KNL_32-LABEL: test28: ; KNL_32: # BB#0: -; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 ; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2 @@ -1300,7 +1303,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; SKX-LABEL: test28: ; SKX: # BB#0: -; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} @@ -1308,7 +1311,7 @@ define void 
@test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; ; SKX_32-LABEL: test28: ; SKX_32: # BB#0: -; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovb %eax, %k1 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 7fa8aba09720..c31b8381aebd 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F @@ -196,6 +196,7 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> ; ; AVX512F-LABEL: test5: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} @@ -446,6 +447,8 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> ; ; AVX512F-LABEL: test11a: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 @@ -491,6 +494,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { ; ; AVX512F-LABEL: test11b: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -540,6 +544,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) { ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11c: @@ -581,6 +586,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) { ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11d: @@ -615,6 +621,8 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { ; ; AVX512F-LABEL: test12: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 @@ -1007,9 +1015,11 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) ; ; AVX512F-LABEL: mload_constmask_v8f32: ; AVX512F: ## BB#0: +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: movw $7, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v8f32: @@ -1062,9 +1072,11 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) { ; ; AVX512F-LABEL: mload_constmask_v8i32: ; AVX512F: ## BB#0: +; 
AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: movw $135, %ax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} +; AVX512F-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: mload_constmask_v8i32: @@ -2221,6 +2233,251 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) { +; AVX-LABEL: test_mask_load_16xi8: +; AVX: ## BB#0: +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: ## implicit-def: %XMM1 +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_2 +; AVX-NEXT: ## BB#1: ## %cond.load +; AVX-NEXT: movzbl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: LBB50_2: ## %else +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_4 +; AVX-NEXT: ## BB#3: ## %cond.load1 +; AVX-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_4: ## %else2 +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_6 +; AVX-NEXT: ## BB#5: ## %cond.load4 +; AVX-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_6: ## %else5 +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_8 +; AVX-NEXT: ## BB#7: ## %cond.load7 +; AVX-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_8: ## %else8 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_10 +; AVX-NEXT: ## BB#9: ## %cond.load10 +; AVX-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_10: ## %else11 +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_12 +; AVX-NEXT: ## BB#11: ## %cond.load13 +; AVX-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_12: ## %else14 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_14 +; AVX-NEXT: ## BB#13: ## %cond.load16 +; AVX-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_14: ## %else17 +; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_16 +; AVX-NEXT: ## BB#15: ## %cond.load19 +; AVX-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_16: ## %else20 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_18 +; AVX-NEXT: ## BB#17: ## %cond.load22 +; AVX-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_18: ## %else23 +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_20 +; AVX-NEXT: ## BB#19: ## %cond.load25 +; AVX-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_20: ## %else26 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_22 +; AVX-NEXT: ## BB#21: ## %cond.load28 +; AVX-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_22: ## %else29 +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_24 +; AVX-NEXT: ## BB#23: ## %cond.load31 +; AVX-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_24: ## %else32 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_26 +; AVX-NEXT: ## BB#25: ## %cond.load34 +; AVX-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_26: ## %else35 +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_28 +; AVX-NEXT: ## BB#27: ## %cond.load37 +; AVX-NEXT: vpinsrb $13, 13(%rdi), %xmm1, 
%xmm1 +; AVX-NEXT: LBB50_28: ## %else38 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_30 +; AVX-NEXT: ## BB#29: ## %cond.load40 +; AVX-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_30: ## %else41 +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB50_32 +; AVX-NEXT: ## BB#31: ## %cond.load43 +; AVX-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB50_32: ## %else44 +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_16xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: ## implicit-def: %XMM0 +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: LBB50_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_6: ## %else5 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_8: ## %else8 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_10: ## %else11 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_12: ## %else14 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_14: ## %else17 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_16 +; AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_16: ## %else20 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_18 +; AVX512F-NEXT: ## BB#17: ## %cond.load22 +; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, 
%xmm0 +; AVX512F-NEXT: LBB50_18: ## %else23 +; AVX512F-NEXT: kshiftlw $6, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_20 +; AVX512F-NEXT: ## BB#19: ## %cond.load25 +; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_20: ## %else26 +; AVX512F-NEXT: kshiftlw $5, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_22 +; AVX512F-NEXT: ## BB#21: ## %cond.load28 +; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_22: ## %else29 +; AVX512F-NEXT: kshiftlw $4, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_24 +; AVX512F-NEXT: ## BB#23: ## %cond.load31 +; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_24: ## %else32 +; AVX512F-NEXT: kshiftlw $3, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_26 +; AVX512F-NEXT: ## BB#25: ## %cond.load34 +; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_26: ## %else35 +; AVX512F-NEXT: kshiftlw $2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_28 +; AVX512F-NEXT: ## BB#27: ## %cond.load37 +; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_28: ## %else38 +; AVX512F-NEXT: kshiftlw $1, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_30 +; AVX512F-NEXT: ## BB#29: ## %cond.load40 +; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_30: ## %else41 +; AVX512F-NEXT: kshiftlw $0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB50_32 +; AVX512F-NEXT: ## BB#31: ## %cond.load43 +; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB50_32: ## %else44 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_16xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -2233,6 +2490,764 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) { +; AVX1-LABEL: test_mask_load_32xi8: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: ## implicit-def: %YMM1 +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_2 +; AVX1-NEXT: ## BB#1: ## %cond.load +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: LBB51_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_4 +; AVX1-NEXT: ## BB#3: ## %cond.load1 +; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_6 +; AVX1-NEXT: ## BB#5: ## %cond.load4 +; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_6: ## %else5 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_8 +; AVX1-NEXT: ## BB#7: ## %cond.load7 +; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_8: ## %else8 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_10 +; AVX1-NEXT: ## BB#9: ## %cond.load10 +; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_10: ## %else11 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_12 +; AVX1-NEXT: ## BB#11: ## %cond.load13 +; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_12: ## %else14 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_14 +; AVX1-NEXT: ## BB#13: ## %cond.load16 +; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_14: ## %else17 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_16 +; AVX1-NEXT: ## BB#15: ## %cond.load19 +; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_16: ## %else20 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_18 +; AVX1-NEXT: ## BB#17: ## %cond.load22 +; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_18: ## %else23 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_20 +; AVX1-NEXT: ## BB#19: ## %cond.load25 +; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_20: ## %else26 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_22 +; AVX1-NEXT: ## BB#21: ## %cond.load28 +; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_22: ## %else29 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_24 +; AVX1-NEXT: ## BB#23: ## %cond.load31 +; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_24: ## %else32 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_26 +; AVX1-NEXT: ## BB#25: ## %cond.load34 +; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_26: ## %else35 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_28 +; AVX1-NEXT: ## BB#27: ## %cond.load37 +; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_28: ## %else38 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_30 +; AVX1-NEXT: ## BB#29: ## %cond.load40 +; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_30: ## %else41 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_32 +; 
AVX1-NEXT: ## BB#31: ## %cond.load43 +; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB51_32: ## %else44 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_34 +; AVX1-NEXT: ## BB#33: ## %cond.load46 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_34: ## %else47 +; AVX1-NEXT: vpextrb $1, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_36 +; AVX1-NEXT: ## BB#35: ## %cond.load49 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_36: ## %else50 +; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_38 +; AVX1-NEXT: ## BB#37: ## %cond.load52 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_38: ## %else53 +; AVX1-NEXT: vpextrb $3, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_40 +; AVX1-NEXT: ## BB#39: ## %cond.load55 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_40: ## %else56 +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_42 +; AVX1-NEXT: ## BB#41: ## %cond.load58 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_42: ## %else59 +; AVX1-NEXT: vpextrb $5, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_44 +; AVX1-NEXT: ## BB#43: ## %cond.load61 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_44: ## %else62 +; AVX1-NEXT: vpextrb $6, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_46 +; AVX1-NEXT: ## BB#45: ## %cond.load64 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_46: ## %else65 +; AVX1-NEXT: vpextrb $7, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_48 +; AVX1-NEXT: ## BB#47: ## %cond.load67 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_48: ## %else68 +; AVX1-NEXT: vpextrb $8, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_50 +; AVX1-NEXT: ## BB#49: ## %cond.load70 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_50: ## %else71 +; AVX1-NEXT: vpextrb $9, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_52 +; AVX1-NEXT: ## BB#51: ## %cond.load73 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_52: ## %else74 +; AVX1-NEXT: vpextrb $10, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_54 +; AVX1-NEXT: ## BB#53: ## %cond.load76 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm3, 
%xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_54: ## %else77 +; AVX1-NEXT: vpextrb $11, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_56 +; AVX1-NEXT: ## BB#55: ## %cond.load79 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_56: ## %else80 +; AVX1-NEXT: vpextrb $12, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_58 +; AVX1-NEXT: ## BB#57: ## %cond.load82 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_58: ## %else83 +; AVX1-NEXT: vpextrb $13, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_60 +; AVX1-NEXT: ## BB#59: ## %cond.load85 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_60: ## %else86 +; AVX1-NEXT: vpextrb $14, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_62 +; AVX1-NEXT: ## BB#61: ## %cond.load88 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_62: ## %else89 +; AVX1-NEXT: vpextrb $15, %xmm2, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB51_64 +; AVX1-NEXT: ## BB#63: ## %cond.load91 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: LBB51_64: ## %else92 +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_load_32xi8: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: ## implicit-def: %YMM1 +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_2 +; AVX2-NEXT: ## BB#1: ## %cond.load +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: LBB51_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_4 +; AVX2-NEXT: ## BB#3: ## %cond.load1 +; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_4: ## %else2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_6 +; AVX2-NEXT: ## BB#5: ## %cond.load4 +; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_6: ## %else5 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_8 +; AVX2-NEXT: ## BB#7: ## %cond.load7 +; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_8: ## %else8 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_10 +; AVX2-NEXT: ## BB#9: ## %cond.load10 +; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-NEXT: LBB51_10: ## %else11 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_12 +; AVX2-NEXT: ## BB#11: ## %cond.load13 +; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_12: ## %else14 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_14 +; AVX2-NEXT: ## BB#13: ## %cond.load16 +; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_14: ## %else17 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_16 +; AVX2-NEXT: ## BB#15: ## %cond.load19 +; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_16: ## %else20 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_18 +; AVX2-NEXT: ## BB#17: ## %cond.load22 +; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_18: ## %else23 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_20 +; AVX2-NEXT: ## BB#19: ## %cond.load25 +; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_20: ## %else26 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_22 +; AVX2-NEXT: ## BB#21: ## %cond.load28 +; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_22: ## %else29 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_24 +; AVX2-NEXT: ## BB#23: ## %cond.load31 +; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_24: ## %else32 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_26 +; AVX2-NEXT: ## BB#25: ## %cond.load34 +; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_26: ## %else35 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_28 +; AVX2-NEXT: ## BB#27: ## %cond.load37 +; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_28: ## %else38 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_30 +; AVX2-NEXT: ## BB#29: ## %cond.load40 +; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_30: ## %else41 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_32 +; AVX2-NEXT: ## BB#31: ## %cond.load43 +; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB51_32: ## %else44 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_34 +; AVX2-NEXT: ## BB#33: ## %cond.load46 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_34: ## %else47 +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; 
AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_36 +; AVX2-NEXT: ## BB#35: ## %cond.load49 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_36: ## %else50 +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_38 +; AVX2-NEXT: ## BB#37: ## %cond.load52 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_38: ## %else53 +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_40 +; AVX2-NEXT: ## BB#39: ## %cond.load55 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_40: ## %else56 +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_42 +; AVX2-NEXT: ## BB#41: ## %cond.load58 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_42: ## %else59 +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_44 +; AVX2-NEXT: ## BB#43: ## %cond.load61 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_44: ## %else62 +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_46 +; AVX2-NEXT: ## BB#45: ## %cond.load64 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_46: ## %else65 +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_48 +; AVX2-NEXT: ## BB#47: ## %cond.load67 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_48: ## %else68 +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_50 +; AVX2-NEXT: ## BB#49: ## %cond.load70 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_50: ## %else71 +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_52 +; AVX2-NEXT: ## BB#51: ## %cond.load73 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_52: ## %else74 +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_54 +; AVX2-NEXT: ## BB#53: ## %cond.load76 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_54: ## %else77 +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_56 +; AVX2-NEXT: ## BB#55: ## %cond.load79 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_56: ## %else80 +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_58 +; AVX2-NEXT: ## BB#57: ## %cond.load82 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; 
AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_58: ## %else83 +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_60 +; AVX2-NEXT: ## BB#59: ## %cond.load85 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_60: ## %else86 +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_62 +; AVX2-NEXT: ## BB#61: ## %cond.load88 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_62: ## %else89 +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB51_64 +; AVX2-NEXT: ## BB#63: ## %cond.load91 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB51_64: ## %else92 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_32xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: ## implicit-def: %YMM1 +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: LBB51_2: ## %else +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_4: ## %else2 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_6: ## %else5 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_8: ## %else8 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_10: ## %else11 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_12: ## %else14 +; AVX512F-NEXT: vpextrb $6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_14: ## %else17 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_16 +; 
AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_16: ## %else20 +; AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_18 +; AVX512F-NEXT: ## BB#17: ## %cond.load22 +; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_18: ## %else23 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_20 +; AVX512F-NEXT: ## BB#19: ## %cond.load25 +; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_20: ## %else26 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_22 +; AVX512F-NEXT: ## BB#21: ## %cond.load28 +; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_22: ## %else29 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_24 +; AVX512F-NEXT: ## BB#23: ## %cond.load31 +; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_24: ## %else32 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_26 +; AVX512F-NEXT: ## BB#25: ## %cond.load34 +; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_26: ## %else35 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_28 +; AVX512F-NEXT: ## BB#27: ## %cond.load37 +; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_28: ## %else38 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_30 +; AVX512F-NEXT: ## BB#29: ## %cond.load40 +; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_30: ## %else41 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_32 +; AVX512F-NEXT: ## BB#31: ## %cond.load43 +; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB51_32: ## %else44 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpextrb $0, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_34 +; AVX512F-NEXT: ## BB#33: ## %cond.load46 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_34: ## %else47 +; AVX512F-NEXT: vpextrb $1, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_36 +; AVX512F-NEXT: ## BB#35: ## %cond.load49 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_36: ## %else50 +; AVX512F-NEXT: vpextrb $2, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_38 +; AVX512F-NEXT: ## BB#37: ## %cond.load52 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; 
AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_38: ## %else53 +; AVX512F-NEXT: vpextrb $3, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_40 +; AVX512F-NEXT: ## BB#39: ## %cond.load55 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_40: ## %else56 +; AVX512F-NEXT: vpextrb $4, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_42 +; AVX512F-NEXT: ## BB#41: ## %cond.load58 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_42: ## %else59 +; AVX512F-NEXT: vpextrb $5, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_44 +; AVX512F-NEXT: ## BB#43: ## %cond.load61 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_44: ## %else62 +; AVX512F-NEXT: vpextrb $6, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_46 +; AVX512F-NEXT: ## BB#45: ## %cond.load64 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_46: ## %else65 +; AVX512F-NEXT: vpextrb $7, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_48 +; AVX512F-NEXT: ## BB#47: ## %cond.load67 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_48: ## %else68 +; AVX512F-NEXT: vpextrb $8, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_50 +; AVX512F-NEXT: ## BB#49: ## %cond.load70 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_50: ## %else71 +; AVX512F-NEXT: vpextrb $9, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_52 +; AVX512F-NEXT: ## BB#51: ## %cond.load73 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_52: ## %else74 +; AVX512F-NEXT: vpextrb $10, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_54 +; AVX512F-NEXT: ## BB#53: ## %cond.load76 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_54: ## %else77 +; AVX512F-NEXT: vpextrb $11, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_56 +; AVX512F-NEXT: ## BB#55: ## %cond.load79 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_56: ## %else80 +; AVX512F-NEXT: vpextrb $12, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_58 +; AVX512F-NEXT: ## BB#57: ## %cond.load82 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_58: ## %else83 +; AVX512F-NEXT: vpextrb $13, %xmm2, %eax +; AVX512F-NEXT: testb $1, 
%al +; AVX512F-NEXT: je LBB51_60 +; AVX512F-NEXT: ## BB#59: ## %cond.load85 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_60: ## %else86 +; AVX512F-NEXT: vpextrb $14, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_62 +; AVX512F-NEXT: ## BB#61: ## %cond.load88 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_62: ## %else89 +; AVX512F-NEXT: vpextrb $15, %xmm2, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB51_64 +; AVX512F-NEXT: ## BB#63: ## %cond.load91 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB51_64: ## %else92 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_32xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2245,6 +3260,2278 @@ define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>) define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) { +; AVX1-LABEL: test_mask_load_64xi8: +; AVX1: ## BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: Ltmp3: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: Ltmp4: +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: Ltmp5: +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: Ltmp6: +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: Ltmp7: +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: Ltmp8: +; AVX1-NEXT: .cfi_def_cfa_offset 56 +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: Ltmp9: +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: Ltmp10: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: Ltmp11: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: Ltmp12: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: Ltmp13: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: Ltmp14: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: Ltmp15: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movl %edi, %r13d +; AVX1-NEXT: testb $1, %dil +; AVX1-NEXT: je LBB52_2 +; AVX1-NEXT: ## BB#1: ## %cond.load +; AVX1-NEXT: movzbl (%rax), %ebp +; AVX1-NEXT: vmovd %ebp, %xmm9 +; AVX1-NEXT: LBB52_2: ## %else +; AVX1-NEXT: testb $1, %sil +; AVX1-NEXT: je LBB52_4 +; AVX1-NEXT: ## BB#3: ## %cond.load1 +; AVX1-NEXT: vpinsrb $1, 1(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_4: ## %else2 +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB52_6 +; AVX1-NEXT: ## BB#5: ## %cond.load4 +; AVX1-NEXT: vpinsrb $2, 2(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_6: ## %else5 +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB52_8 +; AVX1-NEXT: ## BB#7: ## %cond.load7 +; AVX1-NEXT: vpinsrb $3, 3(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_8: ## %else8 +; AVX1-NEXT: testb $1, %r8b 
+; AVX1-NEXT: je LBB52_10 +; AVX1-NEXT: ## BB#9: ## %cond.load10 +; AVX1-NEXT: vpinsrb $4, 4(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_10: ## %else11 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; AVX1-NEXT: testb $1, %r9b +; AVX1-NEXT: je LBB52_12 +; AVX1-NEXT: ## BB#11: ## %cond.load13 +; AVX1-NEXT: vpinsrb $5, 5(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_12: ## %else14 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; AVX1-NEXT: testb $1, %r10b +; AVX1-NEXT: je LBB52_14 +; AVX1-NEXT: ## BB#13: ## %cond.load16 +; AVX1-NEXT: vpinsrb $6, 6(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_14: ## %else17 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; AVX1-NEXT: testb $1, %r11b +; AVX1-NEXT: je LBB52_16 +; AVX1-NEXT: ## BB#15: ## %cond.load19 +; AVX1-NEXT: vpinsrb $7, 7(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_16: ## %else20 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; AVX1-NEXT: testb $1, %r14b +; AVX1-NEXT: je LBB52_18 +; AVX1-NEXT: ## BB#17: ## %cond.load22 +; AVX1-NEXT: vpinsrb $8, 8(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_18: ## %else23 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; AVX1-NEXT: testb $1, %r15b +; AVX1-NEXT: je LBB52_20 +; AVX1-NEXT: ## BB#19: ## %cond.load25 +; AVX1-NEXT: vpinsrb $9, 9(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_20: ## %else26 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dil +; AVX1-NEXT: testb $1, %r12b +; AVX1-NEXT: je LBB52_22 +; AVX1-NEXT: ## BB#21: ## %cond.load28 +; AVX1-NEXT: vpinsrb $10, 10(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_22: ## %else29 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; AVX1-NEXT: testb $1, %dil +; AVX1-NEXT: je LBB52_24 +; AVX1-NEXT: ## BB#23: ## %cond.load31 +; AVX1-NEXT: vpinsrb $11, 11(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_24: ## %else32 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; AVX1-NEXT: testb $1, %bpl +; AVX1-NEXT: je LBB52_26 +; AVX1-NEXT: ## BB#25: ## %cond.load34 +; AVX1-NEXT: vpinsrb $12, 12(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_26: ## %else35 +; AVX1-NEXT: testb $1, %bl +; AVX1-NEXT: je LBB52_28 +; AVX1-NEXT: ## BB#27: ## %cond.load37 +; AVX1-NEXT: vpinsrb $13, 13(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_28: ## %else38 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_30 +; AVX1-NEXT: ## BB#29: ## %cond.load40 +; AVX1-NEXT: vpinsrb $14, 14(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_30: ## %else41 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_32 +; AVX1-NEXT: ## BB#31: ## %cond.load43 +; AVX1-NEXT: vpinsrb $15, 15(%rax), %xmm9, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: LBB52_32: ## %else44 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_34 +; AVX1-NEXT: ## BB#33: ## %cond.load46 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: 
LBB52_34: ## %else47 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_36 +; AVX1-NEXT: ## BB#35: ## %cond.load49 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_36: ## %else50 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_38 +; AVX1-NEXT: ## BB#37: ## %cond.load52 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_38: ## %else53 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_40 +; AVX1-NEXT: ## BB#39: ## %cond.load55 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_40: ## %else56 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_42 +; AVX1-NEXT: ## BB#41: ## %cond.load58 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_42: ## %else59 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_44 +; AVX1-NEXT: ## BB#43: ## %cond.load61 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_44: ## %else62 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_46 +; AVX1-NEXT: ## BB#45: ## %cond.load64 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_46: ## %else65 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_48 +; AVX1-NEXT: ## BB#47: ## %cond.load67 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_48: ## %else68 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_50 +; AVX1-NEXT: ## BB#49: ## %cond.load70 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_50: ## %else71 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_52 +; AVX1-NEXT: ## BB#51: ## %cond.load73 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_52: ## %else74 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_54 +; AVX1-NEXT: ## BB#53: ## %cond.load76 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_54: ## %else77 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_56 +; AVX1-NEXT: ## BB#55: ## %cond.load79 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_56: ## %else80 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_58 +; AVX1-NEXT: ## BB#57: ## %cond.load82 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_58: ## %else83 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_60 +; AVX1-NEXT: ## BB#59: ## %cond.load85 +; 
AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_60: ## %else86 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_62 +; AVX1-NEXT: ## BB#61: ## %cond.load88 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_62: ## %else89 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_64 +; AVX1-NEXT: ## BB#63: ## %cond.load91 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9 +; AVX1-NEXT: LBB52_64: ## %else92 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_66 +; AVX1-NEXT: ## BB#65: ## %cond.load94 +; AVX1-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: LBB52_66: ## %else95 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_68 +; AVX1-NEXT: ## BB#67: ## %cond.load97 +; AVX1-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_68: ## %else98 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_70 +; AVX1-NEXT: ## BB#69: ## %cond.load100 +; AVX1-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_70: ## %else101 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_72 +; AVX1-NEXT: ## BB#71: ## %cond.load103 +; AVX1-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_72: ## %else104 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_74 +; AVX1-NEXT: ## BB#73: ## %cond.load106 +; AVX1-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_74: ## %else107 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_76 +; AVX1-NEXT: ## BB#75: ## %cond.load109 +; AVX1-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_76: ## %else110 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_78 +; AVX1-NEXT: ## BB#77: ## %cond.load112 +; AVX1-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_78: ## %else113 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_80 +; AVX1-NEXT: ## BB#79: ## %cond.load115 +; AVX1-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_80: ## %else116 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_82 +; AVX1-NEXT: ## BB#81: ## %cond.load118 +; AVX1-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_82: ## %else119 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_84 +; AVX1-NEXT: ## BB#83: ## %cond.load121 +; AVX1-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_84: ## %else122 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_86 +; AVX1-NEXT: ## BB#85: ## %cond.load124 +; AVX1-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_86: ## %else125 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_88 +; AVX1-NEXT: ## BB#87: ## %cond.load127 +; AVX1-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_88: ## %else128 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_90 +; AVX1-NEXT: ## BB#89: ## %cond.load130 +; AVX1-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_90: ## %else131 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_92 +; AVX1-NEXT: ## BB#91: ## %cond.load133 +; AVX1-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_92: ## %else134 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_94 +; AVX1-NEXT: ## BB#93: ## %cond.load136 +; AVX1-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_94: ## %else137 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_96 +; AVX1-NEXT: ## BB#95: ## %cond.load139 +; AVX1-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB52_96: ## %else140 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_98 +; AVX1-NEXT: ## BB#97: ## %cond.load142 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_98: ## %else143 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_100 +; AVX1-NEXT: ## BB#99: ## %cond.load145 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_100: ## %else146 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_102 +; AVX1-NEXT: ## BB#101: ## %cond.load148 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_102: ## %else149 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_104 +; AVX1-NEXT: ## BB#103: ## %cond.load151 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_104: ## %else152 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_106 +; AVX1-NEXT: ## BB#105: ## %cond.load154 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_106: ## %else155 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_108 +; AVX1-NEXT: ## BB#107: ## %cond.load157 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_108: ## %else158 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_110 +; AVX1-NEXT: ## BB#109: ## %cond.load160 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_110: ## %else161 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_112 +; AVX1-NEXT: ## BB#111: ## %cond.load163 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; 
AVX1-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_112: ## %else164 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_114 +; AVX1-NEXT: ## BB#113: ## %cond.load166 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_114: ## %else167 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_116 +; AVX1-NEXT: ## BB#115: ## %cond.load169 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_116: ## %else170 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_118 +; AVX1-NEXT: ## BB#117: ## %cond.load172 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_118: ## %else173 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_120 +; AVX1-NEXT: ## BB#119: ## %cond.load175 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_120: ## %else176 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_122 +; AVX1-NEXT: ## BB#121: ## %cond.load178 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_122: ## %else179 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_124 +; AVX1-NEXT: ## BB#123: ## %cond.load181 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_124: ## %else182 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: je LBB52_126 +; AVX1-NEXT: ## BB#125: ## %cond.load184 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_126: ## %else185 +; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX1-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movl %r8d, (%rsp) ## 4-byte Spill +; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: je LBB52_128 +; AVX1-NEXT: ## BB#127: ## %cond.load187 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrb $15, 63(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB52_128: ## %else188 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl %bl, %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, 
-{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX1-NEXT: movzbl %r13b, %r13d +; AVX1-NEXT: vmovd %r13d, %xmm4 +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload +; AVX1-NEXT: movzbl %dil, %ebp +; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX1-NEXT: movzbl %bpl, %ebp +; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX1-NEXT: movzbl %bpl, %ebp +; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movl (%rsp), %ebp ## 4-byte Reload +; AVX1-NEXT: movzbl %bpl, %ebp +; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX1-NEXT: movzbl %bpl, %ebp +; AVX1-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), 
%xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: ## xmm5 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm8 ## 4-byte Folded Reload +; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm6 ## 4-byte Folded Reload +; AVX1-NEXT: ## xmm6 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $3, %r15d, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $4, %r14d, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $6, %r8d, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $7, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX1-NEXT: vpinsrb $10, %esi, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX1-NEXT: vpinsrb $11, %r9d, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX1-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX1-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: vpinsrb $14, %r13d, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX1-NEXT: vpinsrb $15, %r14d, %xmm6, %xmm10 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX1-NEXT: vmovd %edi, %xmm7 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), 
%ebp +; AVX1-NEXT: vpinsrb $1, %r11d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $2, %r15d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $3, %r12d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $4, %r8d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $6, %r9d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $7, %esi, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $10, %r13d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $11, %edx, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $12, %r14d, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $13, %ebx, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7 +; AVX1-NEXT: vpinsrb $15, %ebp, %xmm7, %xmm7 +; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsllw $7, %xmm8, %xmm6 +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vpsllw $7, %xmm10, %xmm4 +; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsllw $7, %xmm7, %xmm6 +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: addq $8, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_load_64xi8: +; AVX2: ## BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: Ltmp3: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: Ltmp4: +; AVX2-NEXT: .cfi_def_cfa_offset 24 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: Ltmp5: +; AVX2-NEXT: .cfi_def_cfa_offset 32 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: Ltmp6: +; AVX2-NEXT: .cfi_def_cfa_offset 40 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: Ltmp7: +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: Ltmp8: +; AVX2-NEXT: .cfi_def_cfa_offset 56 +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: Ltmp9: +; AVX2-NEXT: .cfi_def_cfa_offset 64 +; AVX2-NEXT: Ltmp10: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: Ltmp11: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: Ltmp12: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: Ltmp13: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: Ltmp14: +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: Ltmp15: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: je LBB52_2 +; AVX2-NEXT: ## BB#1: ## %cond.load +; AVX2-NEXT: movzbl (%rax), %ebp +; AVX2-NEXT: vmovd %ebp, %xmm2 +; AVX2-NEXT: LBB52_2: ## %else +; AVX2-NEXT: testb $1, %sil +; AVX2-NEXT: je LBB52_4 +; AVX2-NEXT: ## BB#3: ## %cond.load1 +; AVX2-NEXT: vpinsrb $1, 1(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_4: ## %else2 +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB52_6 +; AVX2-NEXT: ## BB#5: ## %cond.load4 +; AVX2-NEXT: vpinsrb $2, 2(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_6: ## %else5 +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB52_8 +; AVX2-NEXT: ## BB#7: ## %cond.load7 +; AVX2-NEXT: vpinsrb $3, 3(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_8: ## %else8 +; AVX2-NEXT: testb $1, %r8b +; AVX2-NEXT: je LBB52_10 +; AVX2-NEXT: ## BB#9: ## %cond.load10 +; AVX2-NEXT: vpinsrb $4, 4(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_10: ## %else11 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; AVX2-NEXT: testb $1, %r9b +; AVX2-NEXT: je LBB52_12 +; AVX2-NEXT: ## BB#11: ## %cond.load13 +; AVX2-NEXT: vpinsrb $5, 5(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_12: ## %else14 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; AVX2-NEXT: testb $1, %r10b +; AVX2-NEXT: je LBB52_14 +; AVX2-NEXT: ## BB#13: ## %cond.load16 +; AVX2-NEXT: vpinsrb $6, 6(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_14: ## %else17 +; AVX2-NEXT: testb $1, %r11b +; AVX2-NEXT: je LBB52_16 +; AVX2-NEXT: ## BB#15: ## %cond.load19 +; AVX2-NEXT: vpinsrb $7, 7(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_16: ## %else20 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_18 +; AVX2-NEXT: ## BB#17: ## %cond.load22 +; AVX2-NEXT: vpinsrb $8, 8(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_18: ## %else23 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_20 +; AVX2-NEXT: ## BB#19: ## %cond.load25 +; AVX2-NEXT: vpinsrb $9, 9(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_20: ## %else26 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_22 +; AVX2-NEXT: ## BB#21: ## %cond.load28 +; AVX2-NEXT: vpinsrb $10, 10(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_22: ## %else29 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_24 +; AVX2-NEXT: ## BB#23: ## %cond.load31 +; AVX2-NEXT: vpinsrb $11, 11(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_24: ## %else32 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bl +; AVX2-NEXT: testb $1, %bpl +; AVX2-NEXT: je LBB52_26 +; AVX2-NEXT: ## BB#25: ## %cond.load34 +; AVX2-NEXT: vpinsrb $12, 12(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_26: ## %else35 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; AVX2-NEXT: testb $1, %bl +; AVX2-NEXT: je LBB52_28 +; AVX2-NEXT: ## BB#27: ## %cond.load37 +; AVX2-NEXT: vpinsrb $13, 13(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_28: ## %else38 +; AVX2-NEXT: testb $1, %r14b +; AVX2-NEXT: je LBB52_30 +; AVX2-NEXT: ## BB#29: ## %cond.load40 +; AVX2-NEXT: vpinsrb $14, 14(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: LBB52_30: ## %else41 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_32 +; AVX2-NEXT: ## BB#31: ## %cond.load43 +; AVX2-NEXT: vpinsrb $15, 15(%rax), %xmm2, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: 
LBB52_32: ## %else44 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; AVX2-NEXT: testb $1, %r13b +; AVX2-NEXT: je LBB52_34 +; AVX2-NEXT: ## BB#33: ## %cond.load46 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_34: ## %else47 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; AVX2-NEXT: testb $1, %r12b +; AVX2-NEXT: je LBB52_36 +; AVX2-NEXT: ## BB#35: ## %cond.load49 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_36: ## %else50 +; AVX2-NEXT: testb $1, %r15b +; AVX2-NEXT: je LBB52_38 +; AVX2-NEXT: ## BB#37: ## %cond.load52 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_38: ## %else53 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_40 +; AVX2-NEXT: ## BB#39: ## %cond.load55 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_40: ## %else56 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_42 +; AVX2-NEXT: ## BB#41: ## %cond.load58 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_42: ## %else59 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_44 +; AVX2-NEXT: ## BB#43: ## %cond.load61 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_44: ## %else62 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_46 +; AVX2-NEXT: ## BB#45: ## %cond.load64 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_46: ## %else65 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_48 +; AVX2-NEXT: ## BB#47: ## %cond.load67 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_48: ## %else68 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_50 +; AVX2-NEXT: ## BB#49: ## %cond.load70 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_50: ## %else71 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_52 +; AVX2-NEXT: ## BB#51: ## %cond.load73 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_52: ## %else74 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_54 +; AVX2-NEXT: ## BB#53: ## %cond.load76 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_54: ## %else77 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_56 +; AVX2-NEXT: ## BB#55: ## %cond.load79 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_56: ## %else80 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je 
LBB52_58 +; AVX2-NEXT: ## BB#57: ## %cond.load82 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_58: ## %else83 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_60 +; AVX2-NEXT: ## BB#59: ## %cond.load85 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_60: ## %else86 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_62 +; AVX2-NEXT: ## BB#61: ## %cond.load88 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_62: ## %else89 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_64 +; AVX2-NEXT: ## BB#63: ## %cond.load91 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: LBB52_64: ## %else92 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_66 +; AVX2-NEXT: ## BB#65: ## %cond.load94 +; AVX2-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: LBB52_66: ## %else95 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_68 +; AVX2-NEXT: ## BB#67: ## %cond.load97 +; AVX2-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_68: ## %else98 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_70 +; AVX2-NEXT: ## BB#69: ## %cond.load100 +; AVX2-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_70: ## %else101 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_72 +; AVX2-NEXT: ## BB#71: ## %cond.load103 +; AVX2-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_72: ## %else104 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_74 +; AVX2-NEXT: ## BB#73: ## %cond.load106 +; AVX2-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_74: ## %else107 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_76 +; AVX2-NEXT: ## BB#75: ## %cond.load109 +; AVX2-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_76: ## %else110 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_78 +; AVX2-NEXT: ## BB#77: ## %cond.load112 +; AVX2-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_78: ## %else113 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_80 +; AVX2-NEXT: ## BB#79: ## %cond.load115 +; AVX2-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_80: ## %else116 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_82 +; AVX2-NEXT: ## BB#81: ## %cond.load118 +; AVX2-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_82: ## %else119 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_84 +; AVX2-NEXT: ## BB#83: ## %cond.load121 +; AVX2-NEXT: vpinsrb 
$9, 41(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_84: ## %else122 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_86 +; AVX2-NEXT: ## BB#85: ## %cond.load124 +; AVX2-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_86: ## %else125 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_88 +; AVX2-NEXT: ## BB#87: ## %cond.load127 +; AVX2-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_88: ## %else128 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_90 +; AVX2-NEXT: ## BB#89: ## %cond.load130 +; AVX2-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_90: ## %else131 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_92 +; AVX2-NEXT: ## BB#91: ## %cond.load133 +; AVX2-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_92: ## %else134 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_94 +; AVX2-NEXT: ## BB#93: ## %cond.load136 +; AVX2-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_94: ## %else137 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_96 +; AVX2-NEXT: ## BB#95: ## %cond.load139 +; AVX2-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB52_96: ## %else140 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_98 +; AVX2-NEXT: ## BB#97: ## %cond.load142 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_98: ## %else143 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_100 +; AVX2-NEXT: ## BB#99: ## %cond.load145 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_100: ## %else146 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_102 +; AVX2-NEXT: ## BB#101: ## %cond.load148 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_102: ## %else149 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_104 +; AVX2-NEXT: ## BB#103: ## %cond.load151 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_104: ## %else152 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_106 +; AVX2-NEXT: ## BB#105: ## %cond.load154 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_106: ## %else155 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_108 +; AVX2-NEXT: ## BB#107: ## %cond.load157 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_108: ## %else158 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_110 +; AVX2-NEXT: ## BB#109: ## %cond.load160 +; 
AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_110: ## %else161 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_112 +; AVX2-NEXT: ## BB#111: ## %cond.load163 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_112: ## %else164 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_114 +; AVX2-NEXT: ## BB#113: ## %cond.load166 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_114: ## %else167 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_116 +; AVX2-NEXT: ## BB#115: ## %cond.load169 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_116: ## %else170 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_118 +; AVX2-NEXT: ## BB#117: ## %cond.load172 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_118: ## %else173 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_120 +; AVX2-NEXT: ## BB#119: ## %cond.load175 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_120: ## %else176 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_122 +; AVX2-NEXT: ## BB#121: ## %cond.load178 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_122: ## %else179 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: je LBB52_124 +; AVX2-NEXT: ## BB#123: ## %cond.load181 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_124: ## %else182 +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: jne LBB52_126 +; AVX2-NEXT: ## BB#125: +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: jmp LBB52_127 +; AVX2-NEXT: LBB52_126: ## %cond.load184 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_127: ## %else185 +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movl %r8d, (%rsp) ## 4-byte Spill +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movl %esi, %ebp +; AVX2-NEXT: je LBB52_129 +; AVX2-NEXT: ## BB#128: ## %cond.load187 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrb $15, 63(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB52_129: ## %else188 +; AVX2-NEXT: movzbl %r10b, %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %r11b, %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: movl %ecx, 
-{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %bl, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload +; AVX2-NEXT: movzbl %dil, %r13d +; AVX2-NEXT: vmovd %r13d, %xmm4 +; AVX2-NEXT: movzbl %bpl, %ebp +; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX2-NEXT: movzbl %bpl, %ebp +; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX2-NEXT: movzbl %bpl, %ebp +; AVX2-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movl (%rsp), %ebp ## 4-byte Reload +; AVX2-NEXT: movzbl %bpl, %ebp +; AVX2-NEXT: vpinsrb $4, 
%ebp, %xmm4, %xmm4 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX2-NEXT: movzbl %bpl, %ebp +; AVX2-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload +; AVX2-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: ## xmm5 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; AVX2-NEXT: vmovd %r12d, %xmm6 +; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %r15d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $3, %r14d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $4, %ebx, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $6, %r9d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $7, %esi, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX2-NEXT: vpinsrb $10, %edx, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; AVX2-NEXT: vpinsrb $11, %r8d, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX2-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; AVX2-NEXT: movzbl 
{{[0-9]+}}(%rsp), %ebp +; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; AVX2-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; AVX2-NEXT: vmovd %r12d, %xmm7 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; AVX2-NEXT: vpinsrb $1, %r9d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $2, %r11d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $3, %r14d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $6, %r8d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $7, %ebx, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $9, %ebp, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $12, %r15d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $13, %esi, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $15, %edx, %xmm7, %xmm7 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsllw $7, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm2 +; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_64xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: Ltmp0: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: Ltmp1: +; AVX512F-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: Ltmp2: +; AVX512F-NEXT: .cfi_def_cfa_offset 32 +; AVX512F-NEXT: pushq %r13 +; AVX512F-NEXT: Ltmp3: +; AVX512F-NEXT: .cfi_def_cfa_offset 40 +; AVX512F-NEXT: pushq %r12 +; AVX512F-NEXT: Ltmp4: +; AVX512F-NEXT: .cfi_def_cfa_offset 48 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: Ltmp5: +; AVX512F-NEXT: .cfi_def_cfa_offset 56 +; AVX512F-NEXT: subq $76, %rsp +; AVX512F-NEXT: Ltmp6: +; AVX512F-NEXT: .cfi_def_cfa_offset 132 +; AVX512F-NEXT: Ltmp7: +; AVX512F-NEXT: .cfi_offset %rbx, -56 +; AVX512F-NEXT: Ltmp8: +; AVX512F-NEXT: .cfi_offset %r12, -48 +; AVX512F-NEXT: Ltmp9: +; AVX512F-NEXT: .cfi_offset %r13, -40 +; AVX512F-NEXT: Ltmp10: +; AVX512F-NEXT: .cfi_offset %r14, -32 +; AVX512F-NEXT: Ltmp11: +; AVX512F-NEXT: .cfi_offset %r15, -24 +; AVX512F-NEXT: Ltmp12: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: LBB52_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 
2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_6: ## %else5 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_8: ## %else8 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_10: ## %else11 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_12: ## %else14 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_14: ## %else17 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_16 +; AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_16: ## %else20 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_18 +; AVX512F-NEXT: ## BB#17: ## %cond.load22 +; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_18: ## %else23 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, (%rsp) ## 2-byte Folded Spill +; 
AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_20 +; AVX512F-NEXT: ## BB#19: ## %cond.load25 +; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_20: ## %else26 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_22 +; AVX512F-NEXT: ## BB#21: ## %cond.load28 +; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_22: ## %else29 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_24 +; AVX512F-NEXT: ## BB#23: ## %cond.load31 +; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_24: ## %else32 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_26 +; AVX512F-NEXT: ## BB#25: ## %cond.load34 +; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_26: ## %else35 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_28 +; AVX512F-NEXT: ## BB#27: ## %cond.load37 +; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_28: ## %else38 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_30 +; AVX512F-NEXT: ## BB#29: ## %cond.load40 +; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_30: ## %else41 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_32 +; AVX512F-NEXT: ## BB#31: ## %cond.load43 +; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_32: ## %else44 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_34 +; AVX512F-NEXT: ## BB#33: ## %cond.load46 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: 
LBB52_34: ## %else47 +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_36 +; AVX512F-NEXT: ## BB#35: ## %cond.load49 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_36: ## %else50 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_38 +; AVX512F-NEXT: ## BB#37: ## %cond.load52 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_38: ## %else53 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_40 +; AVX512F-NEXT: ## BB#39: ## %cond.load55 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_40: ## %else56 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_42 +; AVX512F-NEXT: ## BB#41: ## %cond.load58 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_42: ## %else59 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_44 +; AVX512F-NEXT: ## BB#43: ## %cond.load61 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_44: ## %else62 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_46 +; AVX512F-NEXT: ## BB#45: ## %cond.load64 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_46: ## %else65 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_48 +; AVX512F-NEXT: ## BB#47: ## %cond.load67 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_48: ## %else68 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb 
%al, %al +; AVX512F-NEXT: je LBB52_50 +; AVX512F-NEXT: ## BB#49: ## %cond.load70 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_50: ## %else71 +; AVX512F-NEXT: kshiftlw $6, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_52 +; AVX512F-NEXT: ## BB#51: ## %cond.load73 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_52: ## %else74 +; AVX512F-NEXT: kshiftlw $5, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_54 +; AVX512F-NEXT: ## BB#53: ## %cond.load76 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_54: ## %else77 +; AVX512F-NEXT: kshiftlw $4, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_56 +; AVX512F-NEXT: ## BB#55: ## %cond.load79 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_56: ## %else80 +; AVX512F-NEXT: kshiftlw $3, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_58 +; AVX512F-NEXT: ## BB#57: ## %cond.load82 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_58: ## %else83 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_60 +; AVX512F-NEXT: ## BB#59: ## %cond.load85 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_60: ## %else86 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_62 +; AVX512F-NEXT: ## BB#61: ## %cond.load88 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_62: ## %else89 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $0, %k1, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_64 +; AVX512F-NEXT: ## BB#63: ## %cond.load91 +; AVX512F-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB52_64: ## %else92 +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_66 +; AVX512F-NEXT: ## BB#65: ## %cond.load94 +; AVX512F-NEXT: vpinsrb $0, 32(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB52_66: ## %else95 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_68 +; AVX512F-NEXT: ## BB#67: ## %cond.load97 +; AVX512F-NEXT: vpinsrb $1, 33(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_68: ## %else98 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_70 +; AVX512F-NEXT: ## BB#69: ## %cond.load100 +; AVX512F-NEXT: vpinsrb $2, 34(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_70: ## %else101 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_72 +; AVX512F-NEXT: ## BB#71: ## %cond.load103 +; AVX512F-NEXT: vpinsrb $3, 35(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_72: ## %else104 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_74 +; AVX512F-NEXT: ## BB#73: ## %cond.load106 +; AVX512F-NEXT: vpinsrb $4, 36(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_74: ## %else107 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_76 +; AVX512F-NEXT: ## BB#75: ## %cond.load109 +; AVX512F-NEXT: vpinsrb $5, 37(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_76: ## %else110 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_78 +; AVX512F-NEXT: ## BB#77: ## %cond.load112 +; AVX512F-NEXT: vpinsrb $6, 38(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_78: ## %else113 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_80 +; 
AVX512F-NEXT: ## BB#79: ## %cond.load115 +; AVX512F-NEXT: vpinsrb $7, 39(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_80: ## %else116 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_82 +; AVX512F-NEXT: ## BB#81: ## %cond.load118 +; AVX512F-NEXT: vpinsrb $8, 40(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_82: ## %else119 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_84 +; AVX512F-NEXT: ## BB#83: ## %cond.load121 +; AVX512F-NEXT: vpinsrb $9, 41(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_84: ## %else122 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_86 +; AVX512F-NEXT: ## BB#85: ## %cond.load124 +; AVX512F-NEXT: vpinsrb $10, 42(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_86: ## %else125 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_88 +; AVX512F-NEXT: ## BB#87: ## %cond.load127 +; AVX512F-NEXT: vpinsrb $11, 43(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_88: ## %else128 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_90 +; AVX512F-NEXT: ## BB#89: ## %cond.load130 +; AVX512F-NEXT: vpinsrb $12, 44(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_90: ## %else131 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm2 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_92 +; AVX512F-NEXT: ## BB#91: ## %cond.load133 +; AVX512F-NEXT: vpinsrb $13, 45(%rdi), %xmm1, %xmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_92: ## %else134 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_94 +; AVX512F-NEXT: ## BB#93: ## %cond.load136 +; AVX512F-NEXT: vpinsrb $14, 46(%rdi), %xmm1, %xmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_94: ## %else137 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_96 +; AVX512F-NEXT: ## BB#95: ## %cond.load139 +; AVX512F-NEXT: vpinsrb $15, 47(%rdi), %xmm1, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: LBB52_96: ## %else140 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_98 +; AVX512F-NEXT: ## BB#97: ## %cond.load142 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $0, 48(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_98: ## %else143 +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_100 +; AVX512F-NEXT: ## BB#99: ## %cond.load145 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $1, 49(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_100: ## %else146 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_102 +; AVX512F-NEXT: ## BB#101: ## %cond.load148 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $2, 50(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_102: ## %else149 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_104 +; AVX512F-NEXT: ## BB#103: ## %cond.load151 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $3, 51(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_104: ## %else152 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_106 +; AVX512F-NEXT: ## BB#105: ## %cond.load154 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $4, 52(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_106: ## %else155 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_108 +; AVX512F-NEXT: ## BB#107: ## %cond.load157 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $5, 53(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_108: ## %else158 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_110 +; AVX512F-NEXT: ## BB#109: ## %cond.load160 +; AVX512F-NEXT: vextracti128 $1, 
%ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $6, 54(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_110: ## %else161 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_112 +; AVX512F-NEXT: ## BB#111: ## %cond.load163 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $7, 55(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_112: ## %else164 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Folded Spill +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_114 +; AVX512F-NEXT: ## BB#113: ## %cond.load166 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $8, 56(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_114: ## %else167 +; AVX512F-NEXT: kshiftlw $6, %k1, %k2 +; AVX512F-NEXT: kshiftrw $15, %k2, %k2 +; AVX512F-NEXT: kmovw %k2, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_116 +; AVX512F-NEXT: ## BB#115: ## %cond.load169 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $9, 57(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_116: ## %else170 +; AVX512F-NEXT: kshiftlw $5, %k1, %k3 +; AVX512F-NEXT: kshiftrw $15, %k3, %k3 +; AVX512F-NEXT: kmovw %k3, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_118 +; AVX512F-NEXT: ## BB#117: ## %cond.load172 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $10, 58(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_118: ## %else173 +; AVX512F-NEXT: kshiftlw $4, %k1, %k4 +; AVX512F-NEXT: kshiftrw $15, %k4, %k4 +; AVX512F-NEXT: kmovw %k4, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_120 +; AVX512F-NEXT: ## BB#119: ## %cond.load175 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $11, 59(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_120: ## %else176 +; AVX512F-NEXT: kshiftlw $3, %k1, %k5 +; AVX512F-NEXT: kshiftrw $15, %k5, %k5 +; AVX512F-NEXT: kmovw %k5, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_122 +; AVX512F-NEXT: ## BB#121: ## %cond.load178 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $12, 60(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_122: ## %else179 +; AVX512F-NEXT: kshiftlw $2, %k1, %k6 +; AVX512F-NEXT: kshiftrw $15, %k6, %k6 +; AVX512F-NEXT: kmovw %k6, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_124 +; AVX512F-NEXT: ## BB#123: ## %cond.load181 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $13, 61(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_124: ## %else182 +; AVX512F-NEXT: kshiftlw $1, %k1, %k7 +; AVX512F-NEXT: kshiftrw $15, %k7, %k7 +; AVX512F-NEXT: kmovw %k7, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_126 +; AVX512F-NEXT: ## BB#125: ## %cond.load184 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; 
AVX512F-NEXT: LBB52_126: ## %else185 +; AVX512F-NEXT: kshiftlw $0, %k1, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB52_128 +; AVX512F-NEXT: ## BB#127: ## %cond.load187 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpinsrb $15, 63(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: LBB52_128: ## %else188 +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, 
{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw (%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, (%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: 
kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw %k2, %eax +; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw %k3, %r12d +; AVX512F-NEXT: kmovw %k4, %r15d +; AVX512F-NEXT: kmovw %k5, %r14d +; AVX512F-NEXT: kmovw %k6, %ebx +; AVX512F-NEXT: kmovw %k7, %r11d +; AVX512F-NEXT: kmovw %k1, %r10d +; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r8d +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r9d +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %edi +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %esi +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %edx +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %r13d ## 4-byte Reload +; AVX512F-NEXT: vmovd %r13d, %xmm2 +; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX512F-NEXT: vmovd %ebp, %xmm3 +; AVX512F-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $9, (%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte 
Folded Reload +; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload +; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload +; AVX512F-NEXT: vmovd %ebp, %xmm6 +; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r13d +; AVX512F-NEXT: vpinsrb $10, %r12d, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r12d +; AVX512F-NEXT: vpinsrb $11, %r15d, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r15d +; AVX512F-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r14d +; AVX512F-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %ebx +; AVX512F-NEXT: vpinsrb $14, %r11d, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r11d +; AVX512F-NEXT: vpinsrb $15, %r10d, %xmm6, %xmm6 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r10d +; AVX512F-NEXT: vmovd %r8d, %xmm7 +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload +; AVX512F-NEXT: kmovw %k0, %r8d +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm7, %xmm2 ## 4-byte Folded Reload +; AVX512F-NEXT: vpinsrb $2, %r9d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $3, %edi, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $10, %r15d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $12, %ebx, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $13, %r11d, %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2 +; 
AVX512F-NEXT: vpinsrb $15, %r8d, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: addq $76, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r12 +; AVX512F-NEXT: popq %r13 +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_64xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 @@ -2258,6 +5545,145 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>) define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) { +; AVX-LABEL: test_mask_load_8xi16: +; AVX: ## BB#0: +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: ## implicit-def: %XMM1 +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_2 +; AVX-NEXT: ## BB#1: ## %cond.load +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: LBB53_2: ## %else +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_4 +; AVX-NEXT: ## BB#3: ## %cond.load1 +; AVX-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_4: ## %else2 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_6 +; AVX-NEXT: ## BB#5: ## %cond.load4 +; AVX-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_6: ## %else5 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_8 +; AVX-NEXT: ## BB#7: ## %cond.load7 +; AVX-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_8: ## %else8 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_10 +; AVX-NEXT: ## BB#9: ## %cond.load10 +; AVX-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_10: ## %else11 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_12 +; AVX-NEXT: ## BB#11: ## %cond.load13 +; AVX-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_12: ## %else14 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_14 +; AVX-NEXT: ## BB#13: ## %cond.load16 +; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_14: ## %else17 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB53_16 +; AVX-NEXT: ## BB#15: ## %cond.load19 +; AVX-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 +; AVX-NEXT: LBB53_16: ## %else20 +; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_8xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: ## implicit-def: %XMM0 +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: LBB53_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je 
LBB53_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_6: ## %else5 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_8: ## %else8 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_10: ## %else11 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_12: ## %else14 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_14: ## %else17 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB53_16 +; AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: LBB53_16: ## %else20 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_8xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -2270,6 +5696,431 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) { +; AVX1-LABEL: test_mask_load_16xi16: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: ## implicit-def: %YMM1 +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_2 +; AVX1-NEXT: ## BB#1: ## %cond.load +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: LBB54_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_4 +; AVX1-NEXT: ## BB#3: ## %cond.load1 +; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_6 +; AVX1-NEXT: ## BB#5: ## %cond.load4 +; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_6: ## %else5 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; 
AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_8 +; AVX1-NEXT: ## BB#7: ## %cond.load7 +; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_8: ## %else8 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_10 +; AVX1-NEXT: ## BB#9: ## %cond.load10 +; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_10: ## %else11 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_12 +; AVX1-NEXT: ## BB#11: ## %cond.load13 +; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_12: ## %else14 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_14 +; AVX1-NEXT: ## BB#13: ## %cond.load16 +; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_14: ## %else17 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_16 +; AVX1-NEXT: ## BB#15: ## %cond.load19 +; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: LBB54_16: ## %else20 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_18 +; AVX1-NEXT: ## BB#17: ## %cond.load22 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_18: ## %else23 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_20 +; AVX1-NEXT: ## BB#19: ## %cond.load25 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_20: ## %else26 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_22 +; AVX1-NEXT: ## BB#21: ## %cond.load28 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_22: ## %else29 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_24 +; AVX1-NEXT: ## BB#23: ## %cond.load31 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_24: ## %else32 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_26 +; AVX1-NEXT: ## BB#25: ## %cond.load34 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_26: ## %else35 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_28 +; AVX1-NEXT: ## BB#27: ## %cond.load37 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_28: ## %else38 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_30 +; AVX1-NEXT: ## BB#29: ## %cond.load40 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_30: ## %else41 +; AVX1-NEXT: 
vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB54_32 +; AVX1-NEXT: ## BB#31: ## %cond.load43 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: LBB54_32: ## %else44 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_load_16xi16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: ## implicit-def: %YMM1 +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_2 +; AVX2-NEXT: ## BB#1: ## %cond.load +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: LBB54_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_4 +; AVX2-NEXT: ## BB#3: ## %cond.load1 +; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_4: ## %else2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_6 +; AVX2-NEXT: ## BB#5: ## %cond.load4 +; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_6: ## %else5 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_8 +; AVX2-NEXT: ## BB#7: ## %cond.load7 +; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_8: ## %else8 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_10 +; AVX2-NEXT: ## BB#9: ## %cond.load10 +; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_10: ## %else11 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_12 +; AVX2-NEXT: ## BB#11: ## %cond.load13 +; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_12: ## %else14 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_14 +; AVX2-NEXT: ## BB#13: ## %cond.load16 +; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_14: ## %else17 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_16 +; AVX2-NEXT: ## BB#15: ## %cond.load19 +; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: LBB54_16: ## %else20 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_18 +; AVX2-NEXT: ## BB#17: ## %cond.load22 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_18: ## %else23 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_20 +; AVX2-NEXT: ## BB#19: ## %cond.load25 +; AVX2-NEXT: vextracti128 $1, 
%ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_20: ## %else26 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_22 +; AVX2-NEXT: ## BB#21: ## %cond.load28 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_22: ## %else29 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_24 +; AVX2-NEXT: ## BB#23: ## %cond.load31 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_24: ## %else32 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_26 +; AVX2-NEXT: ## BB#25: ## %cond.load34 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_26: ## %else35 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_28 +; AVX2-NEXT: ## BB#27: ## %cond.load37 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_28: ## %else38 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_30 +; AVX2-NEXT: ## BB#29: ## %cond.load40 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_30: ## %else41 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB54_32 +; AVX2-NEXT: ## BB#31: ## %cond.load43 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: LBB54_32: ## %else44 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_16xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: ## implicit-def: %YMM0 +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: LBB54_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; 
AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_6: ## %else5 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_8: ## %else8 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_10: ## %else11 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_12: ## %else14 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_14: ## %else17 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_16 +; AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB54_16: ## %else20 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_18 +; AVX512F-NEXT: ## BB#17: ## %cond.load22 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_18: ## %else23 +; AVX512F-NEXT: kshiftlw $6, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_20 +; AVX512F-NEXT: ## BB#19: ## %cond.load25 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_20: ## %else26 +; AVX512F-NEXT: kshiftlw $5, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_22 +; AVX512F-NEXT: ## BB#21: ## %cond.load28 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_22: ## %else29 +; AVX512F-NEXT: kshiftlw $4, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_24 +; AVX512F-NEXT: ## BB#23: ## %cond.load31 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $3, 22(%rdi), 
%xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_24: ## %else32 +; AVX512F-NEXT: kshiftlw $3, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_26 +; AVX512F-NEXT: ## BB#25: ## %cond.load34 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_26: ## %else35 +; AVX512F-NEXT: kshiftlw $2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_28 +; AVX512F-NEXT: ## BB#27: ## %cond.load37 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_28: ## %else38 +; AVX512F-NEXT: kshiftlw $1, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_30 +; AVX512F-NEXT: ## BB#29: ## %cond.load40 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_30: ## %else41 +; AVX512F-NEXT: kshiftlw $0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB54_32 +; AVX512F-NEXT: ## BB#31: ## %cond.load43 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: LBB54_32: ## %else44 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_16xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -2282,6 +6133,777 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>) define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) { +; AVX1-LABEL: test_mask_load_32xi16: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_2 +; AVX1-NEXT: ## BB#1: ## %cond.load +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: LBB55_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_4 +; AVX1-NEXT: ## BB#3: ## %cond.load1 +; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_6 +; AVX1-NEXT: ## BB#5: ## %cond.load4 +; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_6: ## %else5 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_8 +; AVX1-NEXT: ## BB#7: ## %cond.load7 +; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_8: ## %else8 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_10 +; AVX1-NEXT: ## BB#9: ## %cond.load10 +; AVX1-NEXT: vpinsrw $4, 
8(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_10: ## %else11 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_12 +; AVX1-NEXT: ## BB#11: ## %cond.load13 +; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_12: ## %else14 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_14 +; AVX1-NEXT: ## BB#13: ## %cond.load16 +; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_14: ## %else17 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_16 +; AVX1-NEXT: ## BB#15: ## %cond.load19 +; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: LBB55_16: ## %else20 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_18 +; AVX1-NEXT: ## BB#17: ## %cond.load22 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_18: ## %else23 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_20 +; AVX1-NEXT: ## BB#19: ## %cond.load25 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_20: ## %else26 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_22 +; AVX1-NEXT: ## BB#21: ## %cond.load28 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_22: ## %else29 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_24 +; AVX1-NEXT: ## BB#23: ## %cond.load31 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_24: ## %else32 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_26 +; AVX1-NEXT: ## BB#25: ## %cond.load34 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_26: ## %else35 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_28 +; AVX1-NEXT: ## BB#27: ## %cond.load37 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_28: ## %else38 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_30 +; AVX1-NEXT: ## BB#29: ## %cond.load40 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_30: ## %else41 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_32 +; AVX1-NEXT: ## BB#31: ## %cond.load43 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: LBB55_32: ## %else44 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpextrb $0, %xmm4, %eax +; AVX1-NEXT: 
testb $1, %al +; AVX1-NEXT: je LBB55_34 +; AVX1-NEXT: ## BB#33: ## %cond.load46 +; AVX1-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: LBB55_34: ## %else47 +; AVX1-NEXT: vpextrb $1, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_36 +; AVX1-NEXT: ## BB#35: ## %cond.load49 +; AVX1-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_36: ## %else50 +; AVX1-NEXT: vpextrb $2, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_38 +; AVX1-NEXT: ## BB#37: ## %cond.load52 +; AVX1-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_38: ## %else53 +; AVX1-NEXT: vpextrb $3, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_40 +; AVX1-NEXT: ## BB#39: ## %cond.load55 +; AVX1-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_40: ## %else56 +; AVX1-NEXT: vpextrb $4, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_42 +; AVX1-NEXT: ## BB#41: ## %cond.load58 +; AVX1-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_42: ## %else59 +; AVX1-NEXT: vpextrb $5, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_44 +; AVX1-NEXT: ## BB#43: ## %cond.load61 +; AVX1-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_44: ## %else62 +; AVX1-NEXT: vpextrb $6, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_46 +; AVX1-NEXT: ## BB#45: ## %cond.load64 +; AVX1-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_46: ## %else65 +; AVX1-NEXT: vpextrb $7, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_48 +; AVX1-NEXT: ## BB#47: ## %cond.load67 +; AVX1-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-NEXT: LBB55_48: ## %else68 +; AVX1-NEXT: vpextrb $8, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_50 +; AVX1-NEXT: ## BB#49: ## %cond.load70 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_50: ## %else71 +; AVX1-NEXT: vpextrb $9, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_52 +; AVX1-NEXT: ## BB#51: ## %cond.load73 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_52: ## %else74 +; AVX1-NEXT: vpextrb $10, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_54 +; AVX1-NEXT: ## BB#53: ## %cond.load76 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_54: ## %else77 +; AVX1-NEXT: vpextrb $11, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_56 +; AVX1-NEXT: ## BB#55: ## %cond.load79 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_56: ## %else80 +; AVX1-NEXT: vpextrb $12, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_58 +; 
AVX1-NEXT: ## BB#57: ## %cond.load82 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_58: ## %else83 +; AVX1-NEXT: vpextrb $13, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_60 +; AVX1-NEXT: ## BB#59: ## %cond.load85 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_60: ## %else86 +; AVX1-NEXT: vpextrb $14, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_62 +; AVX1-NEXT: ## BB#61: ## %cond.load88 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_62: ## %else89 +; AVX1-NEXT: vpextrb $15, %xmm4, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB55_64 +; AVX1-NEXT: ## BB#63: ## %cond.load91 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: LBB55_64: ## %else92 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $15, %xmm6, %xmm6 +; AVX1-NEXT: vpsraw $15, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vandps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsllw $15, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $15, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vandps %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_load_32xi16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_2 +; AVX2-NEXT: ## BB#1: ## %cond.load +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: LBB55_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_4 +; AVX2-NEXT: ## BB#3: ## %cond.load1 +; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_4: ## %else2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_6 +; AVX2-NEXT: ## BB#5: ## %cond.load4 +; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_6: ## %else5 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_8 +; AVX2-NEXT: ## BB#7: ## %cond.load7 +; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_8: ## %else8 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_10 +; AVX2-NEXT: ## BB#9: ## %cond.load10 +; 
AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_10: ## %else11 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_12 +; AVX2-NEXT: ## BB#11: ## %cond.load13 +; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_12: ## %else14 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_14 +; AVX2-NEXT: ## BB#13: ## %cond.load16 +; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_14: ## %else17 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_16 +; AVX2-NEXT: ## BB#15: ## %cond.load19 +; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: LBB55_16: ## %else20 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_18 +; AVX2-NEXT: ## BB#17: ## %cond.load22 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_18: ## %else23 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_20 +; AVX2-NEXT: ## BB#19: ## %cond.load25 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_20: ## %else26 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_22 +; AVX2-NEXT: ## BB#21: ## %cond.load28 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_22: ## %else29 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_24 +; AVX2-NEXT: ## BB#23: ## %cond.load31 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_24: ## %else32 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_26 +; AVX2-NEXT: ## BB#25: ## %cond.load34 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_26: ## %else35 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_28 +; AVX2-NEXT: ## BB#27: ## %cond.load37 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_28: ## %else38 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_30 +; AVX2-NEXT: ## BB#29: ## %cond.load40 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_30: ## %else41 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_32 +; AVX2-NEXT: ## BB#31: ## %cond.load43 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: LBB55_32: ## %else44 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpextrb $0, 
%xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_34 +; AVX2-NEXT: ## BB#33: ## %cond.load46 +; AVX2-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: LBB55_34: ## %else47 +; AVX2-NEXT: vpextrb $1, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_36 +; AVX2-NEXT: ## BB#35: ## %cond.load49 +; AVX2-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_36: ## %else50 +; AVX2-NEXT: vpextrb $2, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_38 +; AVX2-NEXT: ## BB#37: ## %cond.load52 +; AVX2-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_38: ## %else53 +; AVX2-NEXT: vpextrb $3, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_40 +; AVX2-NEXT: ## BB#39: ## %cond.load55 +; AVX2-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_40: ## %else56 +; AVX2-NEXT: vpextrb $4, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_42 +; AVX2-NEXT: ## BB#41: ## %cond.load58 +; AVX2-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_42: ## %else59 +; AVX2-NEXT: vpextrb $5, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_44 +; AVX2-NEXT: ## BB#43: ## %cond.load61 +; AVX2-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_44: ## %else62 +; AVX2-NEXT: vpextrb $6, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_46 +; AVX2-NEXT: ## BB#45: ## %cond.load64 +; AVX2-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_46: ## %else65 +; AVX2-NEXT: vpextrb $7, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_48 +; AVX2-NEXT: ## BB#47: ## %cond.load67 +; AVX2-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: LBB55_48: ## %else68 +; AVX2-NEXT: vpextrb $8, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_50 +; AVX2-NEXT: ## BB#49: ## %cond.load70 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_50: ## %else71 +; AVX2-NEXT: vpextrb $9, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_52 +; AVX2-NEXT: ## BB#51: ## %cond.load73 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_52: ## %else74 +; AVX2-NEXT: vpextrb $10, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_54 +; AVX2-NEXT: ## BB#53: ## %cond.load76 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_54: ## %else77 +; AVX2-NEXT: vpextrb $11, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_56 +; AVX2-NEXT: ## BB#55: ## %cond.load79 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_56: ## %else80 +; AVX2-NEXT: vpextrb $12, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; 
AVX2-NEXT: je LBB55_58 +; AVX2-NEXT: ## BB#57: ## %cond.load82 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_58: ## %else83 +; AVX2-NEXT: vpextrb $13, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_60 +; AVX2-NEXT: ## BB#59: ## %cond.load85 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_60: ## %else86 +; AVX2-NEXT: vpextrb $14, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_62 +; AVX2-NEXT: ## BB#61: ## %cond.load88 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_62: ## %else89 +; AVX2-NEXT: vpextrb $15, %xmm4, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB55_64 +; AVX2-NEXT: ## BB#63: ## %cond.load91 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: LBB55_64: ## %else92 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_load_32xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_2 +; AVX512F-NEXT: ## BB#1: ## %cond.load +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: LBB55_2: ## %else +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_4 +; AVX512F-NEXT: ## BB#3: ## %cond.load1 +; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_4: ## %else2 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_6 +; AVX512F-NEXT: ## BB#5: ## %cond.load4 +; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_6: ## %else5 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_8 +; AVX512F-NEXT: ## BB#7: ## %cond.load7 +; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_8: ## %else8 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_10 +; AVX512F-NEXT: ## BB#9: ## %cond.load10 +; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_10: ## %else11 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; 
AVX512F-NEXT: je LBB55_12 +; AVX512F-NEXT: ## BB#11: ## %cond.load13 +; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_12: ## %else14 +; AVX512F-NEXT: vpextrb $6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_14 +; AVX512F-NEXT: ## BB#13: ## %cond.load16 +; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_14: ## %else17 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_16 +; AVX512F-NEXT: ## BB#15: ## %cond.load19 +; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: LBB55_16: ## %else20 +; AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_18 +; AVX512F-NEXT: ## BB#17: ## %cond.load22 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_18: ## %else23 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_20 +; AVX512F-NEXT: ## BB#19: ## %cond.load25 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_20: ## %else26 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_22 +; AVX512F-NEXT: ## BB#21: ## %cond.load28 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_22: ## %else29 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_24 +; AVX512F-NEXT: ## BB#23: ## %cond.load31 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_24: ## %else32 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_26 +; AVX512F-NEXT: ## BB#25: ## %cond.load34 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_26: ## %else35 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_28 +; AVX512F-NEXT: ## BB#27: ## %cond.load37 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_28: ## %else38 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_30 +; AVX512F-NEXT: ## BB#29: ## %cond.load40 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_30: ## %else41 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_32 +; AVX512F-NEXT: ## BB#31: ## %cond.load43 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: LBB55_32: ## %else44 +; AVX512F-NEXT: vextracti128 $1, 
%ymm0, %xmm4 +; AVX512F-NEXT: vpextrb $0, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_34 +; AVX512F-NEXT: ## BB#33: ## %cond.load46 +; AVX512F-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: LBB55_34: ## %else47 +; AVX512F-NEXT: vpextrb $1, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_36 +; AVX512F-NEXT: ## BB#35: ## %cond.load49 +; AVX512F-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_36: ## %else50 +; AVX512F-NEXT: vpextrb $2, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_38 +; AVX512F-NEXT: ## BB#37: ## %cond.load52 +; AVX512F-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_38: ## %else53 +; AVX512F-NEXT: vpextrb $3, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_40 +; AVX512F-NEXT: ## BB#39: ## %cond.load55 +; AVX512F-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_40: ## %else56 +; AVX512F-NEXT: vpextrb $4, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_42 +; AVX512F-NEXT: ## BB#41: ## %cond.load58 +; AVX512F-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_42: ## %else59 +; AVX512F-NEXT: vpextrb $5, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_44 +; AVX512F-NEXT: ## BB#43: ## %cond.load61 +; AVX512F-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_44: ## %else62 +; AVX512F-NEXT: vpextrb $6, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_46 +; AVX512F-NEXT: ## BB#45: ## %cond.load64 +; AVX512F-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_46: ## %else65 +; AVX512F-NEXT: vpextrb $7, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_48 +; AVX512F-NEXT: ## BB#47: ## %cond.load67 +; AVX512F-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: LBB55_48: ## %else68 +; AVX512F-NEXT: vpextrb $8, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_50 +; AVX512F-NEXT: ## BB#49: ## %cond.load70 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_50: ## %else71 +; AVX512F-NEXT: vpextrb $9, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_52 +; AVX512F-NEXT: ## BB#51: ## %cond.load73 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_52: ## %else74 +; AVX512F-NEXT: vpextrb $10, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_54 +; AVX512F-NEXT: ## BB#53: ## %cond.load76 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_54: ## %else77 +; AVX512F-NEXT: vpextrb $11, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_56 +; 
AVX512F-NEXT: ## BB#55: ## %cond.load79 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_56: ## %else80 +; AVX512F-NEXT: vpextrb $12, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_58 +; AVX512F-NEXT: ## BB#57: ## %cond.load82 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_58: ## %else83 +; AVX512F-NEXT: vpextrb $13, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_60 +; AVX512F-NEXT: ## BB#59: ## %cond.load85 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_60: ## %else86 +; AVX512F-NEXT: vpextrb $14, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_62 +; AVX512F-NEXT: ## BB#61: ## %cond.load88 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_62: ## %else89 +; AVX512F-NEXT: vpextrb $15, %xmm4, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB55_64 +; AVX512F-NEXT: ## BB#63: ## %cond.load91 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-NEXT: LBB55_64: ## %else92 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_load_32xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2295,6 +6917,241 @@ define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>) define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) { +; AVX-LABEL: test_mask_store_16xi8: +; AVX: ## BB#0: +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_2 +; AVX-NEXT: ## BB#1: ## %cond.store +; AVX-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX-NEXT: LBB56_2: ## %else +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_4 +; AVX-NEXT: ## BB#3: ## %cond.store1 +; AVX-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX-NEXT: LBB56_4: ## %else2 +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_6 +; AVX-NEXT: ## BB#5: ## %cond.store3 +; AVX-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX-NEXT: LBB56_6: ## %else4 +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_8 +; AVX-NEXT: ## BB#7: ## %cond.store5 +; AVX-NEXT: vpextrb 
$3, %xmm1, 3(%rdi) +; AVX-NEXT: LBB56_8: ## %else6 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_10 +; AVX-NEXT: ## BB#9: ## %cond.store7 +; AVX-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX-NEXT: LBB56_10: ## %else8 +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_12 +; AVX-NEXT: ## BB#11: ## %cond.store9 +; AVX-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX-NEXT: LBB56_12: ## %else10 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_14 +; AVX-NEXT: ## BB#13: ## %cond.store11 +; AVX-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX-NEXT: LBB56_14: ## %else12 +; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_16 +; AVX-NEXT: ## BB#15: ## %cond.store13 +; AVX-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX-NEXT: LBB56_16: ## %else14 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_18 +; AVX-NEXT: ## BB#17: ## %cond.store15 +; AVX-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX-NEXT: LBB56_18: ## %else16 +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_20 +; AVX-NEXT: ## BB#19: ## %cond.store17 +; AVX-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX-NEXT: LBB56_20: ## %else18 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_22 +; AVX-NEXT: ## BB#21: ## %cond.store19 +; AVX-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX-NEXT: LBB56_22: ## %else20 +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_24 +; AVX-NEXT: ## BB#23: ## %cond.store21 +; AVX-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX-NEXT: LBB56_24: ## %else22 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_26 +; AVX-NEXT: ## BB#25: ## %cond.store23 +; AVX-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX-NEXT: LBB56_26: ## %else24 +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_28 +; AVX-NEXT: ## BB#27: ## %cond.store25 +; AVX-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX-NEXT: LBB56_28: ## %else26 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_30 +; AVX-NEXT: ## BB#29: ## %cond.store27 +; AVX-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX-NEXT: LBB56_30: ## %else28 +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB56_32 +; AVX-NEXT: ## BB#31: ## %cond.store29 +; AVX-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX-NEXT: LBB56_32: ## %else30 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_16xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX512F-NEXT: LBB56_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_4 +; AVX512F-NEXT: ## BB#3: ## %cond.store1 +; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX512F-NEXT: LBB56_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrb $2, %xmm1, 
2(%rdi) +; AVX512F-NEXT: LBB56_6: ## %else4 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX512F-NEXT: LBB56_8: ## %else6 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX512F-NEXT: LBB56_10: ## %else8 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX512F-NEXT: LBB56_12: ## %else10 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX512F-NEXT: LBB56_14: ## %else12 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX512F-NEXT: LBB56_16: ## %else14 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_18 +; AVX512F-NEXT: ## BB#17: ## %cond.store15 +; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX512F-NEXT: LBB56_18: ## %else16 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_20 +; AVX512F-NEXT: ## BB#19: ## %cond.store17 +; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX512F-NEXT: LBB56_20: ## %else18 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_22 +; AVX512F-NEXT: ## BB#21: ## %cond.store19 +; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX512F-NEXT: LBB56_22: ## %else20 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_24 +; AVX512F-NEXT: ## BB#23: ## %cond.store21 +; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX512F-NEXT: LBB56_24: ## %else22 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_26 +; AVX512F-NEXT: ## BB#25: ## %cond.store23 +; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX512F-NEXT: LBB56_26: ## %else24 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_28 +; AVX512F-NEXT: ## BB#27: ## %cond.store25 +; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX512F-NEXT: LBB56_28: ## %else26 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_30 +; AVX512F-NEXT: ## BB#29: ## %cond.store27 +; 
AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX512F-NEXT: LBB56_30: ## %else28 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB56_32 +; AVX512F-NEXT: ## BB#31: ## %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX512F-NEXT: LBB56_32: ## %else30 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_16xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -2307,6 +7164,647 @@ define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) { +; AVX1-LABEL: test_mask_store_32xi8: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_2 +; AVX1-NEXT: ## BB#1: ## %cond.store +; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX1-NEXT: LBB57_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_4 +; AVX1-NEXT: ## BB#3: ## %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX1-NEXT: LBB57_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_6 +; AVX1-NEXT: ## BB#5: ## %cond.store3 +; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX1-NEXT: LBB57_6: ## %else4 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_8 +; AVX1-NEXT: ## BB#7: ## %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX1-NEXT: LBB57_8: ## %else6 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_10 +; AVX1-NEXT: ## BB#9: ## %cond.store7 +; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX1-NEXT: LBB57_10: ## %else8 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_12 +; AVX1-NEXT: ## BB#11: ## %cond.store9 +; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX1-NEXT: LBB57_12: ## %else10 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_14 +; AVX1-NEXT: ## BB#13: ## %cond.store11 +; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX1-NEXT: LBB57_14: ## %else12 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_16 +; AVX1-NEXT: ## BB#15: ## %cond.store13 +; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX1-NEXT: LBB57_16: ## %else14 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_18 +; AVX1-NEXT: ## BB#17: ## %cond.store15 +; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX1-NEXT: LBB57_18: ## %else16 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_20 +; AVX1-NEXT: ## BB#19: ## %cond.store17 +; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX1-NEXT: LBB57_20: ## %else18 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_22 +; AVX1-NEXT: ## BB#21: ## %cond.store19 +; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX1-NEXT: LBB57_22: ## %else20 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_24 +; AVX1-NEXT: ## BB#23: ## %cond.store21 +; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX1-NEXT: LBB57_24: ## %else22 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_26 +; AVX1-NEXT: ## BB#25: ## %cond.store23 +; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX1-NEXT: LBB57_26: ## %else24 +; AVX1-NEXT: vpextrb $13, 
%xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_28 +; AVX1-NEXT: ## BB#27: ## %cond.store25 +; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX1-NEXT: LBB57_28: ## %else26 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_30 +; AVX1-NEXT: ## BB#29: ## %cond.store27 +; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX1-NEXT: LBB57_30: ## %else28 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_32 +; AVX1-NEXT: ## BB#31: ## %cond.store29 +; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX1-NEXT: LBB57_32: ## %else30 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_34 +; AVX1-NEXT: ## BB#33: ## %cond.store31 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rdi) +; AVX1-NEXT: LBB57_34: ## %else32 +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_36 +; AVX1-NEXT: ## BB#35: ## %cond.store33 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rdi) +; AVX1-NEXT: LBB57_36: ## %else34 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_38 +; AVX1-NEXT: ## BB#37: ## %cond.store35 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rdi) +; AVX1-NEXT: LBB57_38: ## %else36 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_40 +; AVX1-NEXT: ## BB#39: ## %cond.store37 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rdi) +; AVX1-NEXT: LBB57_40: ## %else38 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_42 +; AVX1-NEXT: ## BB#41: ## %cond.store39 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rdi) +; AVX1-NEXT: LBB57_42: ## %else40 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_44 +; AVX1-NEXT: ## BB#43: ## %cond.store41 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rdi) +; AVX1-NEXT: LBB57_44: ## %else42 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_46 +; AVX1-NEXT: ## BB#45: ## %cond.store43 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rdi) +; AVX1-NEXT: LBB57_46: ## %else44 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_48 +; AVX1-NEXT: ## BB#47: ## %cond.store45 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rdi) +; AVX1-NEXT: LBB57_48: ## %else46 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_50 +; AVX1-NEXT: ## BB#49: ## %cond.store47 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rdi) +; AVX1-NEXT: LBB57_50: ## %else48 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_52 +; AVX1-NEXT: ## BB#51: ## %cond.store49 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rdi) +; AVX1-NEXT: LBB57_52: ## %else50 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_54 +; AVX1-NEXT: ## BB#53: ## %cond.store51 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rdi) +; AVX1-NEXT: LBB57_54: ## %else52 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_56 +; 
AVX1-NEXT: ## BB#55: ## %cond.store53 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rdi) +; AVX1-NEXT: LBB57_56: ## %else54 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_58 +; AVX1-NEXT: ## BB#57: ## %cond.store55 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rdi) +; AVX1-NEXT: LBB57_58: ## %else56 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_60 +; AVX1-NEXT: ## BB#59: ## %cond.store57 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rdi) +; AVX1-NEXT: LBB57_60: ## %else58 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_62 +; AVX1-NEXT: ## BB#61: ## %cond.store59 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rdi) +; AVX1-NEXT: LBB57_62: ## %else60 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB57_64 +; AVX1-NEXT: ## BB#63: ## %cond.store61 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX1-NEXT: LBB57_64: ## %else62 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_store_32xi8: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_2 +; AVX2-NEXT: ## BB#1: ## %cond.store +; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX2-NEXT: LBB57_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_4 +; AVX2-NEXT: ## BB#3: ## %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX2-NEXT: LBB57_4: ## %else2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_6 +; AVX2-NEXT: ## BB#5: ## %cond.store3 +; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX2-NEXT: LBB57_6: ## %else4 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_8 +; AVX2-NEXT: ## BB#7: ## %cond.store5 +; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX2-NEXT: LBB57_8: ## %else6 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_10 +; AVX2-NEXT: ## BB#9: ## %cond.store7 +; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX2-NEXT: LBB57_10: ## %else8 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_12 +; AVX2-NEXT: ## BB#11: ## %cond.store9 +; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX2-NEXT: LBB57_12: ## %else10 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_14 +; AVX2-NEXT: ## BB#13: ## %cond.store11 +; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX2-NEXT: LBB57_14: ## %else12 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_16 +; AVX2-NEXT: ## BB#15: ## %cond.store13 +; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX2-NEXT: LBB57_16: ## %else14 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_18 +; AVX2-NEXT: ## BB#17: ## %cond.store15 +; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX2-NEXT: LBB57_18: ## %else16 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_20 +; AVX2-NEXT: ## BB#19: ## %cond.store17 +; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX2-NEXT: LBB57_20: ## %else18 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_22 +; AVX2-NEXT: ## BB#21: ## %cond.store19 +; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX2-NEXT: LBB57_22: ## 
%else20 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_24 +; AVX2-NEXT: ## BB#23: ## %cond.store21 +; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX2-NEXT: LBB57_24: ## %else22 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_26 +; AVX2-NEXT: ## BB#25: ## %cond.store23 +; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX2-NEXT: LBB57_26: ## %else24 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_28 +; AVX2-NEXT: ## BB#27: ## %cond.store25 +; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX2-NEXT: LBB57_28: ## %else26 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_30 +; AVX2-NEXT: ## BB#29: ## %cond.store27 +; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX2-NEXT: LBB57_30: ## %else28 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_32 +; AVX2-NEXT: ## BB#31: ## %cond.store29 +; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX2-NEXT: LBB57_32: ## %else30 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_34 +; AVX2-NEXT: ## BB#33: ## %cond.store31 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rdi) +; AVX2-NEXT: LBB57_34: ## %else32 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_36 +; AVX2-NEXT: ## BB#35: ## %cond.store33 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rdi) +; AVX2-NEXT: LBB57_36: ## %else34 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_38 +; AVX2-NEXT: ## BB#37: ## %cond.store35 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rdi) +; AVX2-NEXT: LBB57_38: ## %else36 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_40 +; AVX2-NEXT: ## BB#39: ## %cond.store37 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rdi) +; AVX2-NEXT: LBB57_40: ## %else38 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_42 +; AVX2-NEXT: ## BB#41: ## %cond.store39 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rdi) +; AVX2-NEXT: LBB57_42: ## %else40 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_44 +; AVX2-NEXT: ## BB#43: ## %cond.store41 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rdi) +; AVX2-NEXT: LBB57_44: ## %else42 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_46 +; AVX2-NEXT: ## BB#45: ## %cond.store43 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rdi) +; AVX2-NEXT: LBB57_46: ## %else44 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_48 +; AVX2-NEXT: ## BB#47: ## %cond.store45 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rdi) +; AVX2-NEXT: LBB57_48: ## %else46 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_50 +; AVX2-NEXT: ## BB#49: ## %cond.store47 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rdi) +; AVX2-NEXT: LBB57_50: ## %else48 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_52 +; AVX2-NEXT: ## BB#51: ## %cond.store49 +; AVX2-NEXT: 
vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rdi) +; AVX2-NEXT: LBB57_52: ## %else50 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_54 +; AVX2-NEXT: ## BB#53: ## %cond.store51 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rdi) +; AVX2-NEXT: LBB57_54: ## %else52 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_56 +; AVX2-NEXT: ## BB#55: ## %cond.store53 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rdi) +; AVX2-NEXT: LBB57_56: ## %else54 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_58 +; AVX2-NEXT: ## BB#57: ## %cond.store55 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rdi) +; AVX2-NEXT: LBB57_58: ## %else56 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_60 +; AVX2-NEXT: ## BB#59: ## %cond.store57 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rdi) +; AVX2-NEXT: LBB57_60: ## %else58 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_62 +; AVX2-NEXT: ## BB#61: ## %cond.store59 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rdi) +; AVX2-NEXT: LBB57_62: ## %else60 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB57_64 +; AVX2-NEXT: ## BB#63: ## %cond.store61 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX2-NEXT: LBB57_64: ## %else62 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_32xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX512F-NEXT: LBB57_2: ## %else +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_4 +; AVX512F-NEXT: ## BB#3: ## %cond.store1 +; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX512F-NEXT: LBB57_4: ## %else2 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX512F-NEXT: LBB57_6: ## %else4 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX512F-NEXT: LBB57_8: ## %else6 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX512F-NEXT: LBB57_10: ## %else8 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX512F-NEXT: LBB57_12: ## %else10 +; AVX512F-NEXT: vpextrb $6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX512F-NEXT: LBB57_14: ## %else12 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX512F-NEXT: LBB57_16: ## %else14 +; 
AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_18 +; AVX512F-NEXT: ## BB#17: ## %cond.store15 +; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX512F-NEXT: LBB57_18: ## %else16 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_20 +; AVX512F-NEXT: ## BB#19: ## %cond.store17 +; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX512F-NEXT: LBB57_20: ## %else18 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_22 +; AVX512F-NEXT: ## BB#21: ## %cond.store19 +; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX512F-NEXT: LBB57_22: ## %else20 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_24 +; AVX512F-NEXT: ## BB#23: ## %cond.store21 +; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX512F-NEXT: LBB57_24: ## %else22 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_26 +; AVX512F-NEXT: ## BB#25: ## %cond.store23 +; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX512F-NEXT: LBB57_26: ## %else24 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_28 +; AVX512F-NEXT: ## BB#27: ## %cond.store25 +; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX512F-NEXT: LBB57_28: ## %else26 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_30 +; AVX512F-NEXT: ## BB#29: ## %cond.store27 +; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX512F-NEXT: LBB57_30: ## %else28 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_32 +; AVX512F-NEXT: ## BB#31: ## %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX512F-NEXT: LBB57_32: ## %else30 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_34 +; AVX512F-NEXT: ## BB#33: ## %cond.store31 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $0, %xmm2, 16(%rdi) +; AVX512F-NEXT: LBB57_34: ## %else32 +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_36 +; AVX512F-NEXT: ## BB#35: ## %cond.store33 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $1, %xmm2, 17(%rdi) +; AVX512F-NEXT: LBB57_36: ## %else34 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_38 +; AVX512F-NEXT: ## BB#37: ## %cond.store35 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $2, %xmm2, 18(%rdi) +; AVX512F-NEXT: LBB57_38: ## %else36 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_40 +; AVX512F-NEXT: ## BB#39: ## %cond.store37 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $3, %xmm2, 19(%rdi) +; AVX512F-NEXT: LBB57_40: ## %else38 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_42 +; AVX512F-NEXT: ## BB#41: ## %cond.store39 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $4, %xmm2, 20(%rdi) +; AVX512F-NEXT: LBB57_42: ## %else40 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_44 +; AVX512F-NEXT: ## BB#43: ## %cond.store41 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $5, %xmm2, 21(%rdi) +; AVX512F-NEXT: LBB57_44: ## %else42 +; AVX512F-NEXT: vpextrb 
$6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_46 +; AVX512F-NEXT: ## BB#45: ## %cond.store43 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $6, %xmm2, 22(%rdi) +; AVX512F-NEXT: LBB57_46: ## %else44 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_48 +; AVX512F-NEXT: ## BB#47: ## %cond.store45 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $7, %xmm2, 23(%rdi) +; AVX512F-NEXT: LBB57_48: ## %else46 +; AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_50 +; AVX512F-NEXT: ## BB#49: ## %cond.store47 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $8, %xmm2, 24(%rdi) +; AVX512F-NEXT: LBB57_50: ## %else48 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_52 +; AVX512F-NEXT: ## BB#51: ## %cond.store49 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $9, %xmm2, 25(%rdi) +; AVX512F-NEXT: LBB57_52: ## %else50 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_54 +; AVX512F-NEXT: ## BB#53: ## %cond.store51 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $10, %xmm2, 26(%rdi) +; AVX512F-NEXT: LBB57_54: ## %else52 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_56 +; AVX512F-NEXT: ## BB#55: ## %cond.store53 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $11, %xmm2, 27(%rdi) +; AVX512F-NEXT: LBB57_56: ## %else54 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_58 +; AVX512F-NEXT: ## BB#57: ## %cond.store55 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $12, %xmm2, 28(%rdi) +; AVX512F-NEXT: LBB57_58: ## %else56 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_60 +; AVX512F-NEXT: ## BB#59: ## %cond.store57 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $13, %xmm2, 29(%rdi) +; AVX512F-NEXT: LBB57_60: ## %else58 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_62 +; AVX512F-NEXT: ## BB#61: ## %cond.store59 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $14, %xmm2, 30(%rdi) +; AVX512F-NEXT: LBB57_62: ## %else60 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB57_64 +; AVX512F-NEXT: ## BB#63: ## %cond.store61 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512F-NEXT: LBB57_64: ## %else62 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_32xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2319,6 +7817,1398 @@ define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>) define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) { +; AVX1-LABEL: test_mask_store_64xi8: +; AVX1: ## BB#0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: testb $1, %dil +; AVX1-NEXT: je LBB58_2 +; AVX1-NEXT: ## BB#1: ## %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rax) +; AVX1-NEXT: LBB58_2: ## %else +; AVX1-NEXT: testb $1, %sil +; AVX1-NEXT: je LBB58_4 +; AVX1-NEXT: ## BB#3: ## %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rax) +; AVX1-NEXT: LBB58_4: ## 
%else2 +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_6 +; AVX1-NEXT: ## BB#5: ## %cond.store3 +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rax) +; AVX1-NEXT: LBB58_6: ## %else4 +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_8 +; AVX1-NEXT: ## BB#7: ## %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rax) +; AVX1-NEXT: LBB58_8: ## %else6 +; AVX1-NEXT: testb $1, %r8b +; AVX1-NEXT: je LBB58_10 +; AVX1-NEXT: ## BB#9: ## %cond.store7 +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rax) +; AVX1-NEXT: LBB58_10: ## %else8 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %r9b +; AVX1-NEXT: je LBB58_12 +; AVX1-NEXT: ## BB#11: ## %cond.store9 +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rax) +; AVX1-NEXT: LBB58_12: ## %else10 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_14 +; AVX1-NEXT: ## BB#13: ## %cond.store11 +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rax) +; AVX1-NEXT: LBB58_14: ## %else12 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_16 +; AVX1-NEXT: ## BB#15: ## %cond.store13 +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rax) +; AVX1-NEXT: LBB58_16: ## %else14 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_18 +; AVX1-NEXT: ## BB#17: ## %cond.store15 +; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rax) +; AVX1-NEXT: LBB58_18: ## %else16 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_20 +; AVX1-NEXT: ## BB#19: ## %cond.store17 +; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rax) +; AVX1-NEXT: LBB58_20: ## %else18 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_22 +; AVX1-NEXT: ## BB#21: ## %cond.store19 +; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rax) +; AVX1-NEXT: LBB58_22: ## %else20 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_24 +; AVX1-NEXT: ## BB#23: ## %cond.store21 +; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rax) +; AVX1-NEXT: LBB58_24: ## %else22 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_26 +; AVX1-NEXT: ## BB#25: ## %cond.store23 +; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rax) +; AVX1-NEXT: LBB58_26: ## %else24 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_28 +; AVX1-NEXT: ## BB#27: ## %cond.store25 +; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rax) +; AVX1-NEXT: LBB58_28: ## %else26 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_30 +; AVX1-NEXT: ## BB#29: ## %cond.store27 +; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rax) +; AVX1-NEXT: LBB58_30: ## %else28 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_32 +; AVX1-NEXT: ## BB#31: ## %cond.store29 +; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rax) +; AVX1-NEXT: LBB58_32: ## %else30 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_34 +; AVX1-NEXT: ## BB#33: ## %cond.store31 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rax) +; AVX1-NEXT: LBB58_34: ## %else32 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_36 +; AVX1-NEXT: ## BB#35: ## %cond.store33 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rax) +; AVX1-NEXT: LBB58_36: ## %else34 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_38 +; AVX1-NEXT: ## BB#37: ## %cond.store35 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; 
AVX1-NEXT: vpextrb $2, %xmm2, 18(%rax) +; AVX1-NEXT: LBB58_38: ## %else36 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_40 +; AVX1-NEXT: ## BB#39: ## %cond.store37 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rax) +; AVX1-NEXT: LBB58_40: ## %else38 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_42 +; AVX1-NEXT: ## BB#41: ## %cond.store39 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rax) +; AVX1-NEXT: LBB58_42: ## %else40 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_44 +; AVX1-NEXT: ## BB#43: ## %cond.store41 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rax) +; AVX1-NEXT: LBB58_44: ## %else42 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_46 +; AVX1-NEXT: ## BB#45: ## %cond.store43 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rax) +; AVX1-NEXT: LBB58_46: ## %else44 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_48 +; AVX1-NEXT: ## BB#47: ## %cond.store45 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rax) +; AVX1-NEXT: LBB58_48: ## %else46 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_50 +; AVX1-NEXT: ## BB#49: ## %cond.store47 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rax) +; AVX1-NEXT: LBB58_50: ## %else48 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_52 +; AVX1-NEXT: ## BB#51: ## %cond.store49 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rax) +; AVX1-NEXT: LBB58_52: ## %else50 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_54 +; AVX1-NEXT: ## BB#53: ## %cond.store51 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rax) +; AVX1-NEXT: LBB58_54: ## %else52 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_56 +; AVX1-NEXT: ## BB#55: ## %cond.store53 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rax) +; AVX1-NEXT: LBB58_56: ## %else54 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_58 +; AVX1-NEXT: ## BB#57: ## %cond.store55 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rax) +; AVX1-NEXT: LBB58_58: ## %else56 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_60 +; AVX1-NEXT: ## BB#59: ## %cond.store57 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rax) +; AVX1-NEXT: LBB58_60: ## %else58 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_62 +; AVX1-NEXT: ## BB#61: ## %cond.store59 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rax) +; AVX1-NEXT: LBB58_62: ## %else60 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_64 +; AVX1-NEXT: ## BB#63: ## %cond.store61 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rax) +; AVX1-NEXT: LBB58_64: ## %else62 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_66 +; AVX1-NEXT: ## BB#65: ## 
%cond.store63 +; AVX1-NEXT: vpextrb $0, %xmm1, 32(%rax) +; AVX1-NEXT: LBB58_66: ## %else64 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_68 +; AVX1-NEXT: ## BB#67: ## %cond.store65 +; AVX1-NEXT: vpextrb $1, %xmm1, 33(%rax) +; AVX1-NEXT: LBB58_68: ## %else66 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_70 +; AVX1-NEXT: ## BB#69: ## %cond.store67 +; AVX1-NEXT: vpextrb $2, %xmm1, 34(%rax) +; AVX1-NEXT: LBB58_70: ## %else68 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_72 +; AVX1-NEXT: ## BB#71: ## %cond.store69 +; AVX1-NEXT: vpextrb $3, %xmm1, 35(%rax) +; AVX1-NEXT: LBB58_72: ## %else70 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_74 +; AVX1-NEXT: ## BB#73: ## %cond.store71 +; AVX1-NEXT: vpextrb $4, %xmm1, 36(%rax) +; AVX1-NEXT: LBB58_74: ## %else72 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_76 +; AVX1-NEXT: ## BB#75: ## %cond.store73 +; AVX1-NEXT: vpextrb $5, %xmm1, 37(%rax) +; AVX1-NEXT: LBB58_76: ## %else74 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_78 +; AVX1-NEXT: ## BB#77: ## %cond.store75 +; AVX1-NEXT: vpextrb $6, %xmm1, 38(%rax) +; AVX1-NEXT: LBB58_78: ## %else76 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_80 +; AVX1-NEXT: ## BB#79: ## %cond.store77 +; AVX1-NEXT: vpextrb $7, %xmm1, 39(%rax) +; AVX1-NEXT: LBB58_80: ## %else78 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_82 +; AVX1-NEXT: ## BB#81: ## %cond.store79 +; AVX1-NEXT: vpextrb $8, %xmm1, 40(%rax) +; AVX1-NEXT: LBB58_82: ## %else80 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_84 +; AVX1-NEXT: ## BB#83: ## %cond.store81 +; AVX1-NEXT: vpextrb $9, %xmm1, 41(%rax) +; AVX1-NEXT: LBB58_84: ## %else82 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_86 +; AVX1-NEXT: ## BB#85: ## %cond.store83 +; AVX1-NEXT: vpextrb $10, %xmm1, 42(%rax) +; AVX1-NEXT: LBB58_86: ## %else84 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_88 +; AVX1-NEXT: ## BB#87: ## %cond.store85 +; AVX1-NEXT: vpextrb $11, %xmm1, 43(%rax) +; AVX1-NEXT: LBB58_88: ## %else86 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_90 +; AVX1-NEXT: ## BB#89: ## %cond.store87 +; AVX1-NEXT: vpextrb $12, %xmm1, 44(%rax) +; AVX1-NEXT: LBB58_90: ## %else88 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_92 +; AVX1-NEXT: ## BB#91: ## %cond.store89 +; AVX1-NEXT: vpextrb $13, %xmm1, 45(%rax) +; AVX1-NEXT: LBB58_92: ## %else90 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_94 +; AVX1-NEXT: ## BB#93: ## %cond.store91 +; AVX1-NEXT: vpextrb $14, %xmm1, 46(%rax) +; AVX1-NEXT: LBB58_94: ## %else92 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_96 +; AVX1-NEXT: ## BB#95: ## %cond.store93 +; AVX1-NEXT: vpextrb $15, %xmm1, 47(%rax) +; AVX1-NEXT: LBB58_96: ## %else94 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_98 +; AVX1-NEXT: ## BB#97: ## %cond.store95 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, 48(%rax) +; AVX1-NEXT: LBB58_98: ## %else96 +; 
AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_100 +; AVX1-NEXT: ## BB#99: ## %cond.store97 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $1, %xmm0, 49(%rax) +; AVX1-NEXT: LBB58_100: ## %else98 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_102 +; AVX1-NEXT: ## BB#101: ## %cond.store99 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rax) +; AVX1-NEXT: LBB58_102: ## %else100 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_104 +; AVX1-NEXT: ## BB#103: ## %cond.store101 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $3, %xmm0, 51(%rax) +; AVX1-NEXT: LBB58_104: ## %else102 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_106 +; AVX1-NEXT: ## BB#105: ## %cond.store103 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $4, %xmm0, 52(%rax) +; AVX1-NEXT: LBB58_106: ## %else104 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_108 +; AVX1-NEXT: ## BB#107: ## %cond.store105 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $5, %xmm0, 53(%rax) +; AVX1-NEXT: LBB58_108: ## %else106 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_110 +; AVX1-NEXT: ## BB#109: ## %cond.store107 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $6, %xmm0, 54(%rax) +; AVX1-NEXT: LBB58_110: ## %else108 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_112 +; AVX1-NEXT: ## BB#111: ## %cond.store109 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $7, %xmm0, 55(%rax) +; AVX1-NEXT: LBB58_112: ## %else110 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_114 +; AVX1-NEXT: ## BB#113: ## %cond.store111 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $8, %xmm0, 56(%rax) +; AVX1-NEXT: LBB58_114: ## %else112 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_116 +; AVX1-NEXT: ## BB#115: ## %cond.store113 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $9, %xmm0, 57(%rax) +; AVX1-NEXT: LBB58_116: ## %else114 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_118 +; AVX1-NEXT: ## BB#117: ## %cond.store115 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $10, %xmm0, 58(%rax) +; AVX1-NEXT: LBB58_118: ## %else116 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_120 +; AVX1-NEXT: ## BB#119: ## %cond.store117 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $11, %xmm0, 59(%rax) +; AVX1-NEXT: LBB58_120: ## %else118 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_122 +; AVX1-NEXT: ## BB#121: ## %cond.store119 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $12, %xmm0, 60(%rax) +; AVX1-NEXT: LBB58_122: ## %else120 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_124 +; AVX1-NEXT: ## BB#123: ## %cond.store121 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $13, %xmm0, 61(%rax) +; AVX1-NEXT: LBB58_124: ## %else122 +; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX1-NEXT: testb $1, %cl +; AVX1-NEXT: je LBB58_126 +; AVX1-NEXT: ## BB#125: ## %cond.store123 +; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $14, %xmm0, 62(%rax) +; AVX1-NEXT: LBB58_126: ## %else124 +; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: je LBB58_128 +; AVX1-NEXT: ## BB#127: ## %cond.store125 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, 63(%rax) +; AVX1-NEXT: LBB58_128: ## %else126 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_store_64xi8: +; AVX2: ## BB#0: +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: je LBB58_2 +; AVX2-NEXT: ## BB#1: ## %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rax) +; AVX2-NEXT: LBB58_2: ## %else +; AVX2-NEXT: testb $1, %sil +; AVX2-NEXT: je LBB58_4 +; AVX2-NEXT: ## BB#3: ## %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rax) +; AVX2-NEXT: LBB58_4: ## %else2 +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_6 +; AVX2-NEXT: ## BB#5: ## %cond.store3 +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rax) +; AVX2-NEXT: LBB58_6: ## %else4 +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_8 +; AVX2-NEXT: ## BB#7: ## %cond.store5 +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rax) +; AVX2-NEXT: LBB58_8: ## %else6 +; AVX2-NEXT: testb $1, %r8b +; AVX2-NEXT: je LBB58_10 +; AVX2-NEXT: ## BB#9: ## %cond.store7 +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rax) +; AVX2-NEXT: LBB58_10: ## %else8 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %r9b +; AVX2-NEXT: je LBB58_12 +; AVX2-NEXT: ## BB#11: ## %cond.store9 +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rax) +; AVX2-NEXT: LBB58_12: ## %else10 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_14 +; AVX2-NEXT: ## BB#13: ## %cond.store11 +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rax) +; AVX2-NEXT: LBB58_14: ## %else12 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_16 +; AVX2-NEXT: ## BB#15: ## %cond.store13 +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rax) +; AVX2-NEXT: LBB58_16: ## %else14 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_18 +; AVX2-NEXT: ## BB#17: ## %cond.store15 +; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rax) +; AVX2-NEXT: LBB58_18: ## %else16 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_20 +; AVX2-NEXT: ## BB#19: ## %cond.store17 +; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rax) +; AVX2-NEXT: LBB58_20: ## %else18 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_22 +; AVX2-NEXT: ## BB#21: ## %cond.store19 +; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rax) +; AVX2-NEXT: LBB58_22: ## %else20 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_24 +; AVX2-NEXT: ## BB#23: ## %cond.store21 +; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rax) +; AVX2-NEXT: LBB58_24: ## %else22 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_26 +; AVX2-NEXT: ## BB#25: ## %cond.store23 +; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rax) +; AVX2-NEXT: LBB58_26: ## %else24 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_28 +; AVX2-NEXT: ## BB#27: ## %cond.store25 +; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rax) +; AVX2-NEXT: LBB58_28: ## %else26 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_30 +; AVX2-NEXT: ## BB#29: ## %cond.store27 +; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rax) +; AVX2-NEXT: LBB58_30: ## %else28 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_32 +; 
AVX2-NEXT: ## BB#31: ## %cond.store29 +; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rax) +; AVX2-NEXT: LBB58_32: ## %else30 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_34 +; AVX2-NEXT: ## BB#33: ## %cond.store31 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rax) +; AVX2-NEXT: LBB58_34: ## %else32 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_36 +; AVX2-NEXT: ## BB#35: ## %cond.store33 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rax) +; AVX2-NEXT: LBB58_36: ## %else34 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_38 +; AVX2-NEXT: ## BB#37: ## %cond.store35 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rax) +; AVX2-NEXT: LBB58_38: ## %else36 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_40 +; AVX2-NEXT: ## BB#39: ## %cond.store37 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rax) +; AVX2-NEXT: LBB58_40: ## %else38 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_42 +; AVX2-NEXT: ## BB#41: ## %cond.store39 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rax) +; AVX2-NEXT: LBB58_42: ## %else40 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_44 +; AVX2-NEXT: ## BB#43: ## %cond.store41 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rax) +; AVX2-NEXT: LBB58_44: ## %else42 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_46 +; AVX2-NEXT: ## BB#45: ## %cond.store43 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rax) +; AVX2-NEXT: LBB58_46: ## %else44 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_48 +; AVX2-NEXT: ## BB#47: ## %cond.store45 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rax) +; AVX2-NEXT: LBB58_48: ## %else46 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_50 +; AVX2-NEXT: ## BB#49: ## %cond.store47 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rax) +; AVX2-NEXT: LBB58_50: ## %else48 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_52 +; AVX2-NEXT: ## BB#51: ## %cond.store49 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rax) +; AVX2-NEXT: LBB58_52: ## %else50 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_54 +; AVX2-NEXT: ## BB#53: ## %cond.store51 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rax) +; AVX2-NEXT: LBB58_54: ## %else52 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_56 +; AVX2-NEXT: ## BB#55: ## %cond.store53 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rax) +; AVX2-NEXT: LBB58_56: ## %else54 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_58 +; AVX2-NEXT: ## BB#57: ## %cond.store55 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rax) +; AVX2-NEXT: LBB58_58: ## %else56 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_60 
+; AVX2-NEXT: ## BB#59: ## %cond.store57 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rax) +; AVX2-NEXT: LBB58_60: ## %else58 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_62 +; AVX2-NEXT: ## BB#61: ## %cond.store59 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rax) +; AVX2-NEXT: LBB58_62: ## %else60 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_64 +; AVX2-NEXT: ## BB#63: ## %cond.store61 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rax) +; AVX2-NEXT: LBB58_64: ## %else62 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_66 +; AVX2-NEXT: ## BB#65: ## %cond.store63 +; AVX2-NEXT: vpextrb $0, %xmm1, 32(%rax) +; AVX2-NEXT: LBB58_66: ## %else64 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_68 +; AVX2-NEXT: ## BB#67: ## %cond.store65 +; AVX2-NEXT: vpextrb $1, %xmm1, 33(%rax) +; AVX2-NEXT: LBB58_68: ## %else66 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_70 +; AVX2-NEXT: ## BB#69: ## %cond.store67 +; AVX2-NEXT: vpextrb $2, %xmm1, 34(%rax) +; AVX2-NEXT: LBB58_70: ## %else68 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_72 +; AVX2-NEXT: ## BB#71: ## %cond.store69 +; AVX2-NEXT: vpextrb $3, %xmm1, 35(%rax) +; AVX2-NEXT: LBB58_72: ## %else70 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_74 +; AVX2-NEXT: ## BB#73: ## %cond.store71 +; AVX2-NEXT: vpextrb $4, %xmm1, 36(%rax) +; AVX2-NEXT: LBB58_74: ## %else72 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_76 +; AVX2-NEXT: ## BB#75: ## %cond.store73 +; AVX2-NEXT: vpextrb $5, %xmm1, 37(%rax) +; AVX2-NEXT: LBB58_76: ## %else74 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_78 +; AVX2-NEXT: ## BB#77: ## %cond.store75 +; AVX2-NEXT: vpextrb $6, %xmm1, 38(%rax) +; AVX2-NEXT: LBB58_78: ## %else76 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_80 +; AVX2-NEXT: ## BB#79: ## %cond.store77 +; AVX2-NEXT: vpextrb $7, %xmm1, 39(%rax) +; AVX2-NEXT: LBB58_80: ## %else78 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_82 +; AVX2-NEXT: ## BB#81: ## %cond.store79 +; AVX2-NEXT: vpextrb $8, %xmm1, 40(%rax) +; AVX2-NEXT: LBB58_82: ## %else80 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_84 +; AVX2-NEXT: ## BB#83: ## %cond.store81 +; AVX2-NEXT: vpextrb $9, %xmm1, 41(%rax) +; AVX2-NEXT: LBB58_84: ## %else82 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_86 +; AVX2-NEXT: ## BB#85: ## %cond.store83 +; AVX2-NEXT: vpextrb $10, %xmm1, 42(%rax) +; AVX2-NEXT: LBB58_86: ## %else84 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_88 +; AVX2-NEXT: ## BB#87: ## %cond.store85 +; AVX2-NEXT: vpextrb $11, %xmm1, 43(%rax) +; AVX2-NEXT: LBB58_88: ## %else86 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_90 +; AVX2-NEXT: ## BB#89: ## %cond.store87 +; AVX2-NEXT: vpextrb $12, %xmm1, 44(%rax) +; AVX2-NEXT: LBB58_90: ## %else88 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_92 +; 
AVX2-NEXT: ## BB#91: ## %cond.store89 +; AVX2-NEXT: vpextrb $13, %xmm1, 45(%rax) +; AVX2-NEXT: LBB58_92: ## %else90 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_94 +; AVX2-NEXT: ## BB#93: ## %cond.store91 +; AVX2-NEXT: vpextrb $14, %xmm1, 46(%rax) +; AVX2-NEXT: LBB58_94: ## %else92 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_96 +; AVX2-NEXT: ## BB#95: ## %cond.store93 +; AVX2-NEXT: vpextrb $15, %xmm1, 47(%rax) +; AVX2-NEXT: LBB58_96: ## %else94 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_98 +; AVX2-NEXT: ## BB#97: ## %cond.store95 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, 48(%rax) +; AVX2-NEXT: LBB58_98: ## %else96 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_100 +; AVX2-NEXT: ## BB#99: ## %cond.store97 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, 49(%rax) +; AVX2-NEXT: LBB58_100: ## %else98 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_102 +; AVX2-NEXT: ## BB#101: ## %cond.store99 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $2, %xmm0, 50(%rax) +; AVX2-NEXT: LBB58_102: ## %else100 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_104 +; AVX2-NEXT: ## BB#103: ## %cond.store101 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $3, %xmm0, 51(%rax) +; AVX2-NEXT: LBB58_104: ## %else102 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_106 +; AVX2-NEXT: ## BB#105: ## %cond.store103 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $4, %xmm0, 52(%rax) +; AVX2-NEXT: LBB58_106: ## %else104 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_108 +; AVX2-NEXT: ## BB#107: ## %cond.store105 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $5, %xmm0, 53(%rax) +; AVX2-NEXT: LBB58_108: ## %else106 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_110 +; AVX2-NEXT: ## BB#109: ## %cond.store107 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $6, %xmm0, 54(%rax) +; AVX2-NEXT: LBB58_110: ## %else108 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_112 +; AVX2-NEXT: ## BB#111: ## %cond.store109 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $7, %xmm0, 55(%rax) +; AVX2-NEXT: LBB58_112: ## %else110 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_114 +; AVX2-NEXT: ## BB#113: ## %cond.store111 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $8, %xmm0, 56(%rax) +; AVX2-NEXT: LBB58_114: ## %else112 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_116 +; AVX2-NEXT: ## BB#115: ## %cond.store113 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $9, %xmm0, 57(%rax) +; AVX2-NEXT: LBB58_116: ## %else114 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_118 +; AVX2-NEXT: ## BB#117: ## %cond.store115 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $10, %xmm0, 58(%rax) +; AVX2-NEXT: LBB58_118: ## %else116 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_120 +; AVX2-NEXT: ## BB#119: ## 
%cond.store117 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $11, %xmm0, 59(%rax) +; AVX2-NEXT: LBB58_120: ## %else118 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_122 +; AVX2-NEXT: ## BB#121: ## %cond.store119 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $12, %xmm0, 60(%rax) +; AVX2-NEXT: LBB58_122: ## %else120 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_124 +; AVX2-NEXT: ## BB#123: ## %cond.store121 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $13, %xmm0, 61(%rax) +; AVX2-NEXT: LBB58_124: ## %else122 +; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl +; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: je LBB58_126 +; AVX2-NEXT: ## BB#125: ## %cond.store123 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $14, %xmm0, 62(%rax) +; AVX2-NEXT: LBB58_126: ## %else124 +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: je LBB58_128 +; AVX2-NEXT: ## BB#127: ## %cond.store125 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, 63(%rax) +; AVX2-NEXT: LBB58_128: ## %else126 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_64xi8: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vpextrb $0, %xmm4, (%rdi) +; AVX512F-NEXT: LBB58_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_4 +; AVX512F-NEXT: ## BB#3: ## %cond.store1 +; AVX512F-NEXT: vpextrb $1, %xmm4, 1(%rdi) +; AVX512F-NEXT: LBB58_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrb $2, %xmm4, 2(%rdi) +; AVX512F-NEXT: LBB58_6: ## %else4 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrb $3, %xmm4, 3(%rdi) +; AVX512F-NEXT: LBB58_8: ## %else6 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrb $4, %xmm4, 4(%rdi) +; AVX512F-NEXT: LBB58_10: ## %else8 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrb $5, %xmm4, 5(%rdi) +; AVX512F-NEXT: LBB58_12: ## %else10 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrb $6, %xmm4, 6(%rdi) +; AVX512F-NEXT: LBB58_14: ## %else12 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: 
kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrb $7, %xmm4, 7(%rdi) +; AVX512F-NEXT: LBB58_16: ## %else14 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_18 +; AVX512F-NEXT: ## BB#17: ## %cond.store15 +; AVX512F-NEXT: vpextrb $8, %xmm4, 8(%rdi) +; AVX512F-NEXT: LBB58_18: ## %else16 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_20 +; AVX512F-NEXT: ## BB#19: ## %cond.store17 +; AVX512F-NEXT: vpextrb $9, %xmm4, 9(%rdi) +; AVX512F-NEXT: LBB58_20: ## %else18 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_22 +; AVX512F-NEXT: ## BB#21: ## %cond.store19 +; AVX512F-NEXT: vpextrb $10, %xmm4, 10(%rdi) +; AVX512F-NEXT: LBB58_22: ## %else20 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_24 +; AVX512F-NEXT: ## BB#23: ## %cond.store21 +; AVX512F-NEXT: vpextrb $11, %xmm4, 11(%rdi) +; AVX512F-NEXT: LBB58_24: ## %else22 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_26 +; AVX512F-NEXT: ## BB#25: ## %cond.store23 +; AVX512F-NEXT: vpextrb $12, %xmm4, 12(%rdi) +; AVX512F-NEXT: LBB58_26: ## %else24 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_28 +; AVX512F-NEXT: ## BB#27: ## %cond.store25 +; AVX512F-NEXT: vpextrb $13, %xmm4, 13(%rdi) +; AVX512F-NEXT: LBB58_28: ## %else26 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_30 +; AVX512F-NEXT: ## BB#29: ## %cond.store27 +; AVX512F-NEXT: vpextrb $14, %xmm4, 14(%rdi) +; AVX512F-NEXT: LBB58_30: ## %else28 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_32 +; AVX512F-NEXT: ## BB#31: ## %cond.store29 +; AVX512F-NEXT: vpextrb $15, %xmm4, 15(%rdi) +; AVX512F-NEXT: LBB58_32: ## %else30 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_34 +; AVX512F-NEXT: ## BB#33: ## %cond.store31 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi) +; AVX512F-NEXT: LBB58_34: ## %else32 +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_36 +; AVX512F-NEXT: ## BB#35: ## %cond.store33 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi) +; AVX512F-NEXT: LBB58_36: ## %else34 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; 
AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_38 +; AVX512F-NEXT: ## BB#37: ## %cond.store35 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi) +; AVX512F-NEXT: LBB58_38: ## %else36 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_40 +; AVX512F-NEXT: ## BB#39: ## %cond.store37 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi) +; AVX512F-NEXT: LBB58_40: ## %else38 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_42 +; AVX512F-NEXT: ## BB#41: ## %cond.store39 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi) +; AVX512F-NEXT: LBB58_42: ## %else40 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_44 +; AVX512F-NEXT: ## BB#43: ## %cond.store41 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi) +; AVX512F-NEXT: LBB58_44: ## %else42 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_46 +; AVX512F-NEXT: ## BB#45: ## %cond.store43 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi) +; AVX512F-NEXT: LBB58_46: ## %else44 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_48 +; AVX512F-NEXT: ## BB#47: ## %cond.store45 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi) +; AVX512F-NEXT: LBB58_48: ## %else46 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_50 +; AVX512F-NEXT: ## BB#49: ## %cond.store47 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi) +; AVX512F-NEXT: LBB58_50: ## %else48 +; AVX512F-NEXT: kshiftlw $6, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_52 +; AVX512F-NEXT: ## BB#51: ## %cond.store49 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi) +; AVX512F-NEXT: LBB58_52: ## %else50 +; AVX512F-NEXT: kshiftlw $5, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_54 +; AVX512F-NEXT: ## BB#53: ## %cond.store51 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi) +; AVX512F-NEXT: LBB58_54: ## %else52 +; AVX512F-NEXT: kshiftlw $4, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_56 +; AVX512F-NEXT: ## BB#55: ## %cond.store53 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi) +; AVX512F-NEXT: LBB58_56: ## %else54 +; AVX512F-NEXT: kshiftlw $3, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_58 +; AVX512F-NEXT: ## 
BB#57: ## %cond.store55 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi) +; AVX512F-NEXT: LBB58_58: ## %else56 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0 +; AVX512F-NEXT: kshiftlw $2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_60 +; AVX512F-NEXT: ## BB#59: ## %cond.store57 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512F-NEXT: vpextrb $13, %xmm1, 29(%rdi) +; AVX512F-NEXT: LBB58_60: ## %else58 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: kshiftlw $1, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_62 +; AVX512F-NEXT: ## BB#61: ## %cond.store59 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512F-NEXT: vpextrb $14, %xmm1, 30(%rdi) +; AVX512F-NEXT: LBB58_62: ## %else60 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $0, %k1, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_64 +; AVX512F-NEXT: ## BB#63: ## %cond.store61 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512F-NEXT: LBB58_64: ## %else62 +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_66 +; AVX512F-NEXT: ## BB#65: ## %cond.store63 +; AVX512F-NEXT: vpextrb $0, %xmm5, 32(%rdi) +; AVX512F-NEXT: LBB58_66: ## %else64 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_68 +; AVX512F-NEXT: ## BB#67: ## %cond.store65 +; AVX512F-NEXT: vpextrb $1, %xmm5, 33(%rdi) +; AVX512F-NEXT: LBB58_68: ## %else66 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_70 +; AVX512F-NEXT: ## BB#69: ## %cond.store67 +; AVX512F-NEXT: vpextrb $2, %xmm5, 34(%rdi) +; AVX512F-NEXT: LBB58_70: ## %else68 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_72 +; AVX512F-NEXT: ## BB#71: ## %cond.store69 +; AVX512F-NEXT: vpextrb $3, %xmm5, 35(%rdi) +; AVX512F-NEXT: LBB58_72: ## %else70 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_74 +; AVX512F-NEXT: ## BB#73: ## %cond.store71 +; AVX512F-NEXT: vpextrb $4, %xmm5, 36(%rdi) +; AVX512F-NEXT: LBB58_74: ## %else72 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_76 +; AVX512F-NEXT: ## BB#75: ## %cond.store73 +; AVX512F-NEXT: vpextrb $5, %xmm5, 37(%rdi) +; AVX512F-NEXT: LBB58_76: ## %else74 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_78 +; AVX512F-NEXT: ## BB#77: ## %cond.store75 +; AVX512F-NEXT: vpextrb $6, %xmm5, 38(%rdi) +; AVX512F-NEXT: LBB58_78: ## %else76 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; 
AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_80 +; AVX512F-NEXT: ## BB#79: ## %cond.store77 +; AVX512F-NEXT: vpextrb $7, %xmm5, 39(%rdi) +; AVX512F-NEXT: LBB58_80: ## %else78 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_82 +; AVX512F-NEXT: ## BB#81: ## %cond.store79 +; AVX512F-NEXT: vpextrb $8, %xmm5, 40(%rdi) +; AVX512F-NEXT: LBB58_82: ## %else80 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_84 +; AVX512F-NEXT: ## BB#83: ## %cond.store81 +; AVX512F-NEXT: vpextrb $9, %xmm5, 41(%rdi) +; AVX512F-NEXT: LBB58_84: ## %else82 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_86 +; AVX512F-NEXT: ## BB#85: ## %cond.store83 +; AVX512F-NEXT: vpextrb $10, %xmm5, 42(%rdi) +; AVX512F-NEXT: LBB58_86: ## %else84 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_88 +; AVX512F-NEXT: ## BB#87: ## %cond.store85 +; AVX512F-NEXT: vpextrb $11, %xmm5, 43(%rdi) +; AVX512F-NEXT: LBB58_88: ## %else86 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_90 +; AVX512F-NEXT: ## BB#89: ## %cond.store87 +; AVX512F-NEXT: vpextrb $12, %xmm5, 44(%rdi) +; AVX512F-NEXT: LBB58_90: ## %else88 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_92 +; AVX512F-NEXT: ## BB#91: ## %cond.store89 +; AVX512F-NEXT: vpextrb $13, %xmm5, 45(%rdi) +; AVX512F-NEXT: LBB58_92: ## %else90 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_94 +; AVX512F-NEXT: ## BB#93: ## %cond.store91 +; AVX512F-NEXT: vpextrb $14, %xmm5, 46(%rdi) +; AVX512F-NEXT: LBB58_94: ## %else92 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_96 +; AVX512F-NEXT: ## BB#95: ## %cond.store93 +; AVX512F-NEXT: vpextrb $15, %xmm5, 47(%rdi) +; AVX512F-NEXT: LBB58_96: ## %else94 +; AVX512F-NEXT: kshiftlw $15, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_98 +; AVX512F-NEXT: ## BB#97: ## %cond.store95 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $0, %xmm0, 48(%rdi) +; AVX512F-NEXT: LBB58_98: ## %else96 +; AVX512F-NEXT: kshiftlw $14, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_100 +; AVX512F-NEXT: ## BB#99: ## %cond.store97 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $1, %xmm0, 49(%rdi) +; AVX512F-NEXT: LBB58_100: ## %else98 +; AVX512F-NEXT: kshiftlw $13, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb 
%al, %al +; AVX512F-NEXT: je LBB58_102 +; AVX512F-NEXT: ## BB#101: ## %cond.store99 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $2, %xmm0, 50(%rdi) +; AVX512F-NEXT: LBB58_102: ## %else100 +; AVX512F-NEXT: kshiftlw $12, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_104 +; AVX512F-NEXT: ## BB#103: ## %cond.store101 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $3, %xmm0, 51(%rdi) +; AVX512F-NEXT: LBB58_104: ## %else102 +; AVX512F-NEXT: kshiftlw $11, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_106 +; AVX512F-NEXT: ## BB#105: ## %cond.store103 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $4, %xmm0, 52(%rdi) +; AVX512F-NEXT: LBB58_106: ## %else104 +; AVX512F-NEXT: kshiftlw $10, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_108 +; AVX512F-NEXT: ## BB#107: ## %cond.store105 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $5, %xmm0, 53(%rdi) +; AVX512F-NEXT: LBB58_108: ## %else106 +; AVX512F-NEXT: kshiftlw $9, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_110 +; AVX512F-NEXT: ## BB#109: ## %cond.store107 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $6, %xmm0, 54(%rdi) +; AVX512F-NEXT: LBB58_110: ## %else108 +; AVX512F-NEXT: kshiftlw $8, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_112 +; AVX512F-NEXT: ## BB#111: ## %cond.store109 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $7, %xmm0, 55(%rdi) +; AVX512F-NEXT: LBB58_112: ## %else110 +; AVX512F-NEXT: kshiftlw $7, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_114 +; AVX512F-NEXT: ## BB#113: ## %cond.store111 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $8, %xmm0, 56(%rdi) +; AVX512F-NEXT: LBB58_114: ## %else112 +; AVX512F-NEXT: kshiftlw $6, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_116 +; AVX512F-NEXT: ## BB#115: ## %cond.store113 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $9, %xmm0, 57(%rdi) +; AVX512F-NEXT: LBB58_116: ## %else114 +; AVX512F-NEXT: kshiftlw $5, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_118 +; AVX512F-NEXT: ## BB#117: ## %cond.store115 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $10, %xmm0, 58(%rdi) +; AVX512F-NEXT: LBB58_118: ## %else116 +; AVX512F-NEXT: kshiftlw $4, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_120 +; AVX512F-NEXT: ## BB#119: ## %cond.store117 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $11, %xmm0, 59(%rdi) +; AVX512F-NEXT: LBB58_120: ## %else118 +; AVX512F-NEXT: kshiftlw $3, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je 
LBB58_122 +; AVX512F-NEXT: ## BB#121: ## %cond.store119 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $12, %xmm0, 60(%rdi) +; AVX512F-NEXT: LBB58_122: ## %else120 +; AVX512F-NEXT: kshiftlw $2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_124 +; AVX512F-NEXT: ## BB#123: ## %cond.store121 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $13, %xmm0, 61(%rdi) +; AVX512F-NEXT: LBB58_124: ## %else122 +; AVX512F-NEXT: kshiftlw $1, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_126 +; AVX512F-NEXT: ## BB#125: ## %cond.store123 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $14, %xmm0, 62(%rdi) +; AVX512F-NEXT: LBB58_126: ## %else124 +; AVX512F-NEXT: kshiftlw $0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB58_128 +; AVX512F-NEXT: ## BB#127: ## %cond.store125 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512F-NEXT: vpextrb $15, %xmm0, 63(%rdi) +; AVX512F-NEXT: LBB58_128: ## %else126 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_64xi8: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 @@ -2331,6 +9221,131 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>) define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) { +; AVX-LABEL: test_mask_store_8xi16: +; AVX: ## BB#0: +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_2 +; AVX-NEXT: ## BB#1: ## %cond.store +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: movw %ax, (%rdi) +; AVX-NEXT: LBB59_2: ## %else +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_4 +; AVX-NEXT: ## BB#3: ## %cond.store1 +; AVX-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX-NEXT: LBB59_4: ## %else2 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_6 +; AVX-NEXT: ## BB#5: ## %cond.store3 +; AVX-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX-NEXT: LBB59_6: ## %else4 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_8 +; AVX-NEXT: ## BB#7: ## %cond.store5 +; AVX-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX-NEXT: LBB59_8: ## %else6 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_10 +; AVX-NEXT: ## BB#9: ## %cond.store7 +; AVX-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX-NEXT: LBB59_10: ## %else8 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_12 +; AVX-NEXT: ## BB#11: ## %cond.store9 +; AVX-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX-NEXT: LBB59_12: ## %else10 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_14 +; AVX-NEXT: ## BB#13: ## %cond.store11 +; AVX-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX-NEXT: LBB59_14: ## %else12 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB59_16 +; AVX-NEXT: ## BB#15: ## %cond.store13 +; AVX-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX-NEXT: LBB59_16: ## %else14 +; AVX-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_8xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw 
$15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: LBB59_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_4 +; AVX512F-NEXT: ## BB#3: ## %cond.store1 +; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX512F-NEXT: LBB59_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX512F-NEXT: LBB59_6: ## %else4 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX512F-NEXT: LBB59_8: ## %else6 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX512F-NEXT: LBB59_10: ## %else8 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX512F-NEXT: LBB59_12: ## %else10 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX512F-NEXT: LBB59_14: ## %else12 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB59_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX512F-NEXT: LBB59_16: ## %else14 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_8xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -2343,6 +9358,373 @@ define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> % declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) { +; AVX1-LABEL: test_mask_store_16xi16: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_2 +; AVX1-NEXT: ## BB#1: ## %cond.store +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movw %ax, (%rdi) +; AVX1-NEXT: LBB60_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_4 +; AVX1-NEXT: ## BB#3: ## %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX1-NEXT: LBB60_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_6 +; AVX1-NEXT: ## BB#5: ## %cond.store3 +; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX1-NEXT: LBB60_6: ## %else4 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_8 +; 
AVX1-NEXT: ## BB#7: ## %cond.store5 +; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX1-NEXT: LBB60_8: ## %else6 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_10 +; AVX1-NEXT: ## BB#9: ## %cond.store7 +; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX1-NEXT: LBB60_10: ## %else8 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_12 +; AVX1-NEXT: ## BB#11: ## %cond.store9 +; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX1-NEXT: LBB60_12: ## %else10 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_14 +; AVX1-NEXT: ## BB#13: ## %cond.store11 +; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX1-NEXT: LBB60_14: ## %else12 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_16 +; AVX1-NEXT: ## BB#15: ## %cond.store13 +; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX1-NEXT: LBB60_16: ## %else14 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_18 +; AVX1-NEXT: ## BB#17: ## %cond.store15 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: movw %ax, 16(%rdi) +; AVX1-NEXT: LBB60_18: ## %else16 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_20 +; AVX1-NEXT: ## BB#19: ## %cond.store17 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm2, 18(%rdi) +; AVX1-NEXT: LBB60_20: ## %else18 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_22 +; AVX1-NEXT: ## BB#21: ## %cond.store19 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm2, 20(%rdi) +; AVX1-NEXT: LBB60_22: ## %else20 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_24 +; AVX1-NEXT: ## BB#23: ## %cond.store21 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm2, 22(%rdi) +; AVX1-NEXT: LBB60_24: ## %else22 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_26 +; AVX1-NEXT: ## BB#25: ## %cond.store23 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm2, 24(%rdi) +; AVX1-NEXT: LBB60_26: ## %else24 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_28 +; AVX1-NEXT: ## BB#27: ## %cond.store25 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm2, 26(%rdi) +; AVX1-NEXT: LBB60_28: ## %else26 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_30 +; AVX1-NEXT: ## BB#29: ## %cond.store27 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm2, 28(%rdi) +; AVX1-NEXT: LBB60_30: ## %else28 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB60_32 +; AVX1-NEXT: ## BB#31: ## %cond.store29 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX1-NEXT: LBB60_32: ## %else30 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_store_16xi16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_2 +; AVX2-NEXT: ## BB#1: ## %cond.store +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movw %ax, (%rdi) +; AVX2-NEXT: LBB60_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_4 +; AVX2-NEXT: ## BB#3: ## %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX2-NEXT: LBB60_4: ## %else2 +; AVX2-NEXT: 
vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_6 +; AVX2-NEXT: ## BB#5: ## %cond.store3 +; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX2-NEXT: LBB60_6: ## %else4 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_8 +; AVX2-NEXT: ## BB#7: ## %cond.store5 +; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX2-NEXT: LBB60_8: ## %else6 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_10 +; AVX2-NEXT: ## BB#9: ## %cond.store7 +; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX2-NEXT: LBB60_10: ## %else8 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_12 +; AVX2-NEXT: ## BB#11: ## %cond.store9 +; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX2-NEXT: LBB60_12: ## %else10 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_14 +; AVX2-NEXT: ## BB#13: ## %cond.store11 +; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX2-NEXT: LBB60_14: ## %else12 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_16 +; AVX2-NEXT: ## BB#15: ## %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX2-NEXT: LBB60_16: ## %else14 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_18 +; AVX2-NEXT: ## BB#17: ## %cond.store15 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: movw %ax, 16(%rdi) +; AVX2-NEXT: LBB60_18: ## %else16 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_20 +; AVX2-NEXT: ## BB#19: ## %cond.store17 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, 18(%rdi) +; AVX2-NEXT: LBB60_20: ## %else18 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_22 +; AVX2-NEXT: ## BB#21: ## %cond.store19 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm2, 20(%rdi) +; AVX2-NEXT: LBB60_22: ## %else20 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_24 +; AVX2-NEXT: ## BB#23: ## %cond.store21 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm2, 22(%rdi) +; AVX2-NEXT: LBB60_24: ## %else22 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_26 +; AVX2-NEXT: ## BB#25: ## %cond.store23 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm2, 24(%rdi) +; AVX2-NEXT: LBB60_26: ## %else24 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_28 +; AVX2-NEXT: ## BB#27: ## %cond.store25 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm2, 26(%rdi) +; AVX2-NEXT: LBB60_28: ## %else26 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_30 +; AVX2-NEXT: ## BB#29: ## %cond.store27 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm2, 28(%rdi) +; AVX2-NEXT: LBB60_30: ## %else28 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB60_32 +; AVX2-NEXT: ## BB#31: ## %cond.store29 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX2-NEXT: LBB60_32: ## %else30 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_16xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw 
$15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: LBB60_2: ## %else +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_4 +; AVX512F-NEXT: ## BB#3: ## %cond.store1 +; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX512F-NEXT: LBB60_4: ## %else2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX512F-NEXT: LBB60_6: ## %else4 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX512F-NEXT: LBB60_8: ## %else6 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX512F-NEXT: LBB60_10: ## %else8 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX512F-NEXT: LBB60_12: ## %else10 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX512F-NEXT: LBB60_14: ## %else12 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX512F-NEXT: LBB60_16: ## %else14 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_18 +; AVX512F-NEXT: ## BB#17: ## %cond.store15 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: movw %ax, 16(%rdi) +; AVX512F-NEXT: LBB60_18: ## %else16 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_20 +; AVX512F-NEXT: ## BB#19: ## %cond.store17 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX512F-NEXT: LBB60_20: ## %else18 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_22 +; AVX512F-NEXT: ## BB#21: ## %cond.store19 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX512F-NEXT: LBB60_22: ## %else20 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax 
+; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_24 +; AVX512F-NEXT: ## BB#23: ## %cond.store21 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi) +; AVX512F-NEXT: LBB60_24: ## %else22 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_26 +; AVX512F-NEXT: ## BB#25: ## %cond.store23 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi) +; AVX512F-NEXT: LBB60_26: ## %else24 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_28 +; AVX512F-NEXT: ## BB#27: ## %cond.store25 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi) +; AVX512F-NEXT: LBB60_28: ## %else26 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_30 +; AVX512F-NEXT: ## BB#29: ## %cond.store27 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi) +; AVX512F-NEXT: LBB60_30: ## %else28 +; AVX512F-NEXT: kshiftlw $0, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: je LBB60_32 +; AVX512F-NEXT: ## BB#31: ## %cond.store29 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX512F-NEXT: LBB60_32: ## %else30 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_16xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 @@ -2355,6 +9737,659 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1 declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>) define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) { +; AVX1-LABEL: test_mask_store_32xi16: +; AVX1: ## BB#0: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_2 +; AVX1-NEXT: ## BB#1: ## %cond.store +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movw %ax, (%rdi) +; AVX1-NEXT: LBB61_2: ## %else +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_4 +; AVX1-NEXT: ## BB#3: ## %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX1-NEXT: LBB61_4: ## %else2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_6 +; AVX1-NEXT: ## BB#5: ## %cond.store3 +; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX1-NEXT: LBB61_6: ## %else4 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_8 +; AVX1-NEXT: ## BB#7: ## %cond.store5 +; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX1-NEXT: LBB61_8: ## %else6 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_10 +; AVX1-NEXT: ## BB#9: ## %cond.store7 +; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX1-NEXT: LBB61_10: ## %else8 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_12 +; AVX1-NEXT: ## BB#11: ## %cond.store9 +; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX1-NEXT: LBB61_12: ## %else10 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_14 +; AVX1-NEXT: ## BB#13: ## %cond.store11 +; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX1-NEXT: LBB61_14: ## %else12 +; AVX1-NEXT: 
vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_16 +; AVX1-NEXT: ## BB#15: ## %cond.store13 +; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX1-NEXT: LBB61_16: ## %else14 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_18 +; AVX1-NEXT: ## BB#17: ## %cond.store15 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: movw %ax, 16(%rdi) +; AVX1-NEXT: LBB61_18: ## %else16 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_20 +; AVX1-NEXT: ## BB#19: ## %cond.store17 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $1, %xmm3, 18(%rdi) +; AVX1-NEXT: LBB61_20: ## %else18 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_22 +; AVX1-NEXT: ## BB#21: ## %cond.store19 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $2, %xmm3, 20(%rdi) +; AVX1-NEXT: LBB61_22: ## %else20 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_24 +; AVX1-NEXT: ## BB#23: ## %cond.store21 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $3, %xmm3, 22(%rdi) +; AVX1-NEXT: LBB61_24: ## %else22 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_26 +; AVX1-NEXT: ## BB#25: ## %cond.store23 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $4, %xmm3, 24(%rdi) +; AVX1-NEXT: LBB61_26: ## %else24 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_28 +; AVX1-NEXT: ## BB#27: ## %cond.store25 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $5, %xmm3, 26(%rdi) +; AVX1-NEXT: LBB61_28: ## %else26 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_30 +; AVX1-NEXT: ## BB#29: ## %cond.store27 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpextrw $6, %xmm3, 28(%rdi) +; AVX1-NEXT: LBB61_30: ## %else28 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_32 +; AVX1-NEXT: ## BB#31: ## %cond.store29 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrw $7, %xmm1, 30(%rdi) +; AVX1-NEXT: LBB61_32: ## %else30 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_34 +; AVX1-NEXT: ## BB#33: ## %cond.store31 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: movw %ax, 32(%rdi) +; AVX1-NEXT: LBB61_34: ## %else32 +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_36 +; AVX1-NEXT: ## BB#35: ## %cond.store33 +; AVX1-NEXT: vpextrw $1, %xmm2, 34(%rdi) +; AVX1-NEXT: LBB61_36: ## %else34 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_38 +; AVX1-NEXT: ## BB#37: ## %cond.store35 +; AVX1-NEXT: vpextrw $2, %xmm2, 36(%rdi) +; AVX1-NEXT: LBB61_38: ## %else36 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_40 +; AVX1-NEXT: ## BB#39: ## %cond.store37 +; AVX1-NEXT: vpextrw $3, %xmm2, 38(%rdi) +; AVX1-NEXT: LBB61_40: ## %else38 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_42 +; AVX1-NEXT: ## BB#41: ## %cond.store39 +; AVX1-NEXT: vpextrw $4, %xmm2, 40(%rdi) +; AVX1-NEXT: LBB61_42: ## %else40 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_44 +; AVX1-NEXT: ## BB#43: ## %cond.store41 +; AVX1-NEXT: vpextrw $5, %xmm2, 42(%rdi) +; 
AVX1-NEXT: LBB61_44: ## %else42 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_46 +; AVX1-NEXT: ## BB#45: ## %cond.store43 +; AVX1-NEXT: vpextrw $6, %xmm2, 44(%rdi) +; AVX1-NEXT: LBB61_46: ## %else44 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_48 +; AVX1-NEXT: ## BB#47: ## %cond.store45 +; AVX1-NEXT: vpextrw $7, %xmm2, 46(%rdi) +; AVX1-NEXT: LBB61_48: ## %else46 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_50 +; AVX1-NEXT: ## BB#49: ## %cond.store47 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: movw %ax, 48(%rdi) +; AVX1-NEXT: LBB61_50: ## %else48 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_52 +; AVX1-NEXT: ## BB#51: ## %cond.store49 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm1, 50(%rdi) +; AVX1-NEXT: LBB61_52: ## %else50 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_54 +; AVX1-NEXT: ## BB#53: ## %cond.store51 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $2, %xmm1, 52(%rdi) +; AVX1-NEXT: LBB61_54: ## %else52 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_56 +; AVX1-NEXT: ## BB#55: ## %cond.store53 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $3, %xmm1, 54(%rdi) +; AVX1-NEXT: LBB61_56: ## %else54 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_58 +; AVX1-NEXT: ## BB#57: ## %cond.store55 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $4, %xmm1, 56(%rdi) +; AVX1-NEXT: LBB61_58: ## %else56 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_60 +; AVX1-NEXT: ## BB#59: ## %cond.store57 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $5, %xmm1, 58(%rdi) +; AVX1-NEXT: LBB61_60: ## %else58 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_62 +; AVX1-NEXT: ## BB#61: ## %cond.store59 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrw $6, %xmm1, 60(%rdi) +; AVX1-NEXT: LBB61_62: ## %else60 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: je LBB61_64 +; AVX1-NEXT: ## BB#63: ## %cond.store61 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm0, 62(%rdi) +; AVX1-NEXT: LBB61_64: ## %else62 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mask_store_32xi16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_2 +; AVX2-NEXT: ## BB#1: ## %cond.store +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movw %ax, (%rdi) +; AVX2-NEXT: LBB61_2: ## %else +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_4 +; AVX2-NEXT: ## BB#3: ## %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX2-NEXT: LBB61_4: ## %else2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_6 +; AVX2-NEXT: ## BB#5: ## %cond.store3 +; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX2-NEXT: LBB61_6: ## %else4 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_8 +; AVX2-NEXT: ## BB#7: ## %cond.store5 +; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX2-NEXT: LBB61_8: ## %else6 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_10 +; 
AVX2-NEXT: ## BB#9: ## %cond.store7 +; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX2-NEXT: LBB61_10: ## %else8 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_12 +; AVX2-NEXT: ## BB#11: ## %cond.store9 +; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX2-NEXT: LBB61_12: ## %else10 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_14 +; AVX2-NEXT: ## BB#13: ## %cond.store11 +; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX2-NEXT: LBB61_14: ## %else12 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_16 +; AVX2-NEXT: ## BB#15: ## %cond.store13 +; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX2-NEXT: LBB61_16: ## %else14 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_18 +; AVX2-NEXT: ## BB#17: ## %cond.store15 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: movw %ax, 16(%rdi) +; AVX2-NEXT: LBB61_18: ## %else16 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_20 +; AVX2-NEXT: ## BB#19: ## %cond.store17 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $1, %xmm3, 18(%rdi) +; AVX2-NEXT: LBB61_20: ## %else18 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_22 +; AVX2-NEXT: ## BB#21: ## %cond.store19 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $2, %xmm3, 20(%rdi) +; AVX2-NEXT: LBB61_22: ## %else20 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_24 +; AVX2-NEXT: ## BB#23: ## %cond.store21 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $3, %xmm3, 22(%rdi) +; AVX2-NEXT: LBB61_24: ## %else22 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_26 +; AVX2-NEXT: ## BB#25: ## %cond.store23 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $4, %xmm3, 24(%rdi) +; AVX2-NEXT: LBB61_26: ## %else24 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_28 +; AVX2-NEXT: ## BB#27: ## %cond.store25 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $5, %xmm3, 26(%rdi) +; AVX2-NEXT: LBB61_28: ## %else26 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_30 +; AVX2-NEXT: ## BB#29: ## %cond.store27 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpextrw $6, %xmm3, 28(%rdi) +; AVX2-NEXT: LBB61_30: ## %else28 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_32 +; AVX2-NEXT: ## BB#31: ## %cond.store29 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrw $7, %xmm1, 30(%rdi) +; AVX2-NEXT: LBB61_32: ## %else30 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_34 +; AVX2-NEXT: ## BB#33: ## %cond.store31 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: movw %ax, 32(%rdi) +; AVX2-NEXT: LBB61_34: ## %else32 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_36 +; AVX2-NEXT: ## BB#35: ## %cond.store33 +; AVX2-NEXT: vpextrw $1, %xmm2, 34(%rdi) +; AVX2-NEXT: LBB61_36: ## %else34 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_38 +; AVX2-NEXT: ## BB#37: ## %cond.store35 +; AVX2-NEXT: vpextrw $2, %xmm2, 36(%rdi) +; AVX2-NEXT: LBB61_38: ## %else36 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: 
testb $1, %al +; AVX2-NEXT: je LBB61_40 +; AVX2-NEXT: ## BB#39: ## %cond.store37 +; AVX2-NEXT: vpextrw $3, %xmm2, 38(%rdi) +; AVX2-NEXT: LBB61_40: ## %else38 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_42 +; AVX2-NEXT: ## BB#41: ## %cond.store39 +; AVX2-NEXT: vpextrw $4, %xmm2, 40(%rdi) +; AVX2-NEXT: LBB61_42: ## %else40 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_44 +; AVX2-NEXT: ## BB#43: ## %cond.store41 +; AVX2-NEXT: vpextrw $5, %xmm2, 42(%rdi) +; AVX2-NEXT: LBB61_44: ## %else42 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_46 +; AVX2-NEXT: ## BB#45: ## %cond.store43 +; AVX2-NEXT: vpextrw $6, %xmm2, 44(%rdi) +; AVX2-NEXT: LBB61_46: ## %else44 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_48 +; AVX2-NEXT: ## BB#47: ## %cond.store45 +; AVX2-NEXT: vpextrw $7, %xmm2, 46(%rdi) +; AVX2-NEXT: LBB61_48: ## %else46 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_50 +; AVX2-NEXT: ## BB#49: ## %cond.store47 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: movw %ax, 48(%rdi) +; AVX2-NEXT: LBB61_50: ## %else48 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_52 +; AVX2-NEXT: ## BB#51: ## %cond.store49 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm1, 50(%rdi) +; AVX2-NEXT: LBB61_52: ## %else50 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_54 +; AVX2-NEXT: ## BB#53: ## %cond.store51 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $2, %xmm1, 52(%rdi) +; AVX2-NEXT: LBB61_54: ## %else52 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_56 +; AVX2-NEXT: ## BB#55: ## %cond.store53 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $3, %xmm1, 54(%rdi) +; AVX2-NEXT: LBB61_56: ## %else54 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_58 +; AVX2-NEXT: ## BB#57: ## %cond.store55 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $4, %xmm1, 56(%rdi) +; AVX2-NEXT: LBB61_58: ## %else56 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_60 +; AVX2-NEXT: ## BB#59: ## %cond.store57 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $5, %xmm1, 58(%rdi) +; AVX2-NEXT: LBB61_60: ## %else58 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_62 +; AVX2-NEXT: ## BB#61: ## %cond.store59 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrw $6, %xmm1, 60(%rdi) +; AVX2-NEXT: LBB61_62: ## %else60 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: je LBB61_64 +; AVX2-NEXT: ## BB#63: ## %cond.store61 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm0, 62(%rdi) +; AVX2-NEXT: LBB61_64: ## %else62 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mask_store_32xi16: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_2 +; AVX512F-NEXT: ## BB#1: ## %cond.store +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: LBB61_2: ## %else +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_4 +; AVX512F-NEXT: ## BB#3: ## 
%cond.store1 +; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX512F-NEXT: LBB61_4: ## %else2 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_6 +; AVX512F-NEXT: ## BB#5: ## %cond.store3 +; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX512F-NEXT: LBB61_6: ## %else4 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_8 +; AVX512F-NEXT: ## BB#7: ## %cond.store5 +; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX512F-NEXT: LBB61_8: ## %else6 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_10 +; AVX512F-NEXT: ## BB#9: ## %cond.store7 +; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX512F-NEXT: LBB61_10: ## %else8 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_12 +; AVX512F-NEXT: ## BB#11: ## %cond.store9 +; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX512F-NEXT: LBB61_12: ## %else10 +; AVX512F-NEXT: vpextrb $6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_14 +; AVX512F-NEXT: ## BB#13: ## %cond.store11 +; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX512F-NEXT: LBB61_14: ## %else12 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_16 +; AVX512F-NEXT: ## BB#15: ## %cond.store13 +; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX512F-NEXT: LBB61_16: ## %else14 +; AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_18 +; AVX512F-NEXT: ## BB#17: ## %cond.store15 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: movw %ax, 16(%rdi) +; AVX512F-NEXT: LBB61_18: ## %else16 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_20 +; AVX512F-NEXT: ## BB#19: ## %cond.store17 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $1, %xmm3, 18(%rdi) +; AVX512F-NEXT: LBB61_20: ## %else18 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_22 +; AVX512F-NEXT: ## BB#21: ## %cond.store19 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $2, %xmm3, 20(%rdi) +; AVX512F-NEXT: LBB61_22: ## %else20 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_24 +; AVX512F-NEXT: ## BB#23: ## %cond.store21 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $3, %xmm3, 22(%rdi) +; AVX512F-NEXT: LBB61_24: ## %else22 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_26 +; AVX512F-NEXT: ## BB#25: ## %cond.store23 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $4, %xmm3, 24(%rdi) +; AVX512F-NEXT: LBB61_26: ## %else24 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_28 +; AVX512F-NEXT: ## BB#27: ## %cond.store25 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $5, %xmm3, 26(%rdi) +; AVX512F-NEXT: LBB61_28: ## %else26 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_30 +; AVX512F-NEXT: ## BB#29: ## %cond.store27 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpextrw $6, %xmm3, 28(%rdi) +; AVX512F-NEXT: LBB61_30: ## %else28 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_32 +; AVX512F-NEXT: ## BB#31: ## %cond.store29 +; 
AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vpextrw $7, %xmm1, 30(%rdi) +; AVX512F-NEXT: LBB61_32: ## %else30 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpextrb $0, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_34 +; AVX512F-NEXT: ## BB#33: ## %cond.store31 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: movw %ax, 32(%rdi) +; AVX512F-NEXT: LBB61_34: ## %else32 +; AVX512F-NEXT: vpextrb $1, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_36 +; AVX512F-NEXT: ## BB#35: ## %cond.store33 +; AVX512F-NEXT: vpextrw $1, %xmm2, 34(%rdi) +; AVX512F-NEXT: LBB61_36: ## %else34 +; AVX512F-NEXT: vpextrb $2, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_38 +; AVX512F-NEXT: ## BB#37: ## %cond.store35 +; AVX512F-NEXT: vpextrw $2, %xmm2, 36(%rdi) +; AVX512F-NEXT: LBB61_38: ## %else36 +; AVX512F-NEXT: vpextrb $3, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_40 +; AVX512F-NEXT: ## BB#39: ## %cond.store37 +; AVX512F-NEXT: vpextrw $3, %xmm2, 38(%rdi) +; AVX512F-NEXT: LBB61_40: ## %else38 +; AVX512F-NEXT: vpextrb $4, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_42 +; AVX512F-NEXT: ## BB#41: ## %cond.store39 +; AVX512F-NEXT: vpextrw $4, %xmm2, 40(%rdi) +; AVX512F-NEXT: LBB61_42: ## %else40 +; AVX512F-NEXT: vpextrb $5, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_44 +; AVX512F-NEXT: ## BB#43: ## %cond.store41 +; AVX512F-NEXT: vpextrw $5, %xmm2, 42(%rdi) +; AVX512F-NEXT: LBB61_44: ## %else42 +; AVX512F-NEXT: vpextrb $6, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_46 +; AVX512F-NEXT: ## BB#45: ## %cond.store43 +; AVX512F-NEXT: vpextrw $6, %xmm2, 44(%rdi) +; AVX512F-NEXT: LBB61_46: ## %else44 +; AVX512F-NEXT: vpextrb $7, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_48 +; AVX512F-NEXT: ## BB#47: ## %cond.store45 +; AVX512F-NEXT: vpextrw $7, %xmm2, 46(%rdi) +; AVX512F-NEXT: LBB61_48: ## %else46 +; AVX512F-NEXT: vpextrb $8, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_50 +; AVX512F-NEXT: ## BB#49: ## %cond.store47 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: movw %ax, 48(%rdi) +; AVX512F-NEXT: LBB61_50: ## %else48 +; AVX512F-NEXT: vpextrb $9, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_52 +; AVX512F-NEXT: ## BB#51: ## %cond.store49 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $1, %xmm1, 50(%rdi) +; AVX512F-NEXT: LBB61_52: ## %else50 +; AVX512F-NEXT: vpextrb $10, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_54 +; AVX512F-NEXT: ## BB#53: ## %cond.store51 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $2, %xmm1, 52(%rdi) +; AVX512F-NEXT: LBB61_54: ## %else52 +; AVX512F-NEXT: vpextrb $11, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_56 +; AVX512F-NEXT: ## BB#55: ## %cond.store53 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $3, %xmm1, 54(%rdi) +; AVX512F-NEXT: LBB61_56: ## %else54 +; AVX512F-NEXT: vpextrb $12, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_58 +; AVX512F-NEXT: ## BB#57: ## %cond.store55 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $4, %xmm1, 56(%rdi) +; AVX512F-NEXT: LBB61_58: ## %else56 +; AVX512F-NEXT: vpextrb $13, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_60 +; AVX512F-NEXT: ## 
BB#59: ## %cond.store57 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $5, %xmm1, 58(%rdi) +; AVX512F-NEXT: LBB61_60: ## %else58 +; AVX512F-NEXT: vpextrb $14, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_62 +; AVX512F-NEXT: ## BB#61: ## %cond.store59 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512F-NEXT: vpextrw $6, %xmm1, 60(%rdi) +; AVX512F-NEXT: LBB61_62: ## %else60 +; AVX512F-NEXT: vpextrb $15, %xmm0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je LBB61_64 +; AVX512F-NEXT: ## BB#63: ## %cond.store61 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-NEXT: vpextrw $7, %xmm0, 62(%rdi) +; AVX512F-NEXT: LBB61_64: ## %else62 +; AVX512F-NEXT: retq +; ; SKX-LABEL: test_mask_store_32xi16: ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll index 524f954523d9..6e1264b4fd43 100644 --- a/llvm/test/CodeGen/X86/materialize.ll +++ b/llvm/test/CodeGen/X86/materialize.ll @@ -125,6 +125,7 @@ entry: ; CHECK32-LABEL: one16: ; CHECK32: xorl %eax, %eax ; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: # kill ; CHECK32-NEXT: retl } @@ -135,6 +136,7 @@ entry: ; CHECK32-LABEL: minus_one16: ; CHECK32: xorl %eax, %eax ; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: # kill ; CHECK32-NEXT: retl } diff --git a/llvm/test/CodeGen/X86/movmsk.ll b/llvm/test/CodeGen/X86/movmsk.ll index ee2e7182c8c8..1caa22a15947 100644 --- a/llvm/test/CodeGen/X86/movmsk.ll +++ b/llvm/test/CodeGen/X86/movmsk.ll @@ -102,6 +102,7 @@ define void @float_call_signbit(double %n) { ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movd %xmm0, %rdi ; CHECK-NEXT: shrq $63, %rdi +; CHECK-NEXT: ## kill: %EDI %EDI %RDI ; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL entry: %t0 = bitcast double %n to i64 diff --git a/llvm/test/CodeGen/X86/or-lea.ll b/llvm/test/CodeGen/X86/or-lea.ll index f45a639ffa2c..e65056a91c43 100644 --- a/llvm/test/CodeGen/X86/or-lea.ll +++ b/llvm/test/CodeGen/X86/or-lea.ll @@ -9,6 +9,8 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift1_and1: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: leal (%rsi,%rdi,2), %eax ; CHECK-NEXT: retq @@ -22,6 +24,8 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) { define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift1_and1_swapped: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: leal (%rsi,%rdi,2), %eax ; CHECK-NEXT: retq @@ -35,6 +39,8 @@ define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) { define i32 @or_shift2_and1(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift2_and1: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: leal (%rsi,%rdi,4), %eax ; CHECK-NEXT: retq @@ -48,6 +54,8 @@ define i32 @or_shift2_and1(i32 %x, i32 %y) { define i32 @or_shift3_and1(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift3_and1: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: leal (%rsi,%rdi,8), %eax ; CHECK-NEXT: retq @@ -61,6 +69,8 @@ define i32 @or_shift3_and1(i32 %x, i32 %y) { define i32 @or_shift3_and7(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift3_and7: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: andl $7, %esi ; CHECK-NEXT: leal 
(%rsi,%rdi,8), %eax ; CHECK-NEXT: retq @@ -76,6 +86,8 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) { define i32 @or_shift4_and1(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift4_and1: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: shll $4, %edi ; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: leal (%rsi,%rdi), %eax @@ -92,6 +104,7 @@ define i32 @or_shift4_and1(i32 %x, i32 %y) { define i32 @or_shift3_and8(i32 %x, i32 %y) { ; CHECK-LABEL: or_shift3_and8: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: leal (,%rdi,8), %eax ; CHECK-NEXT: andl $8, %esi ; CHECK-NEXT: orl %esi, %eax diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 657d51c05bd2..5893b5c304c3 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -66,6 +66,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq entry: %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > @@ -206,6 +207,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq entry: %A = mul <16 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/pr28173.ll b/llvm/test/CodeGen/X86/pr28173.ll index 7c20d0857d90..31ea4ffb5616 100644 --- a/llvm/test/CodeGen/X86/pr28173.ll +++ b/llvm/test/CodeGen/X86/pr28173.ll @@ -8,6 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" define i64 @foo64(i1 zeroext %i, i32 %j) #0 { ; CHECK-LABEL: foo64: ; CHECK: # BB#0: +; CHECK-NEXT: # kill ; CHECK-NEXT: orq $-2, %rdi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/promote-i16.ll b/llvm/test/CodeGen/X86/promote-i16.ll index 484dbf0fbe16..7eb367480d76 100644 --- a/llvm/test/CodeGen/X86/promote-i16.ll +++ b/llvm/test/CodeGen/X86/promote-i16.ll @@ -5,6 +5,7 @@ entry: ; CHECK-LABEL: foo: ; CHECK: movzwl 4(%esp), %eax ; CHECK-NEXT: xorl $21998, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retl %0 = xor i16 %x, 21998 ret i16 %0 @@ -15,6 +16,7 @@ entry: ; CHECK-LABEL: bar: ; CHECK: movzwl 4(%esp), %eax ; CHECK-NEXT: xorl $54766, %eax +; CHECK-NEXT: # kill ; CHECK-NEXT: retl %0 = xor i16 %x, 54766 ret i16 %0 diff --git a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll index 035291c0d8ad..a264adffe790 100644 --- a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll @@ -28,6 +28,7 @@ define i32 @test__blcfill_u32(i32 %a0) { ; ; X64-LABEL: test__blcfill_u32: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq @@ -47,6 +48,7 @@ define i32 @test__blci_u32(i32 %a0) { ; ; X64-LABEL: test__blci_u32: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: xorl $-1, %eax ; X64-NEXT: orl %edi, %eax @@ -91,6 +93,7 @@ define i32 @test__blcmsk_u32(i32 %a0) { ; ; X64-LABEL: test__blcmsk_u32: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: xorl %edi, %eax ; X64-NEXT: retq @@ -109,6 +112,7 @@ define i32 
@test__blcs_u32(i32 %a0) { ; ; X64-LABEL: test__blcs_u32: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-i8-constant.ll b/llvm/test/CodeGen/X86/urem-i8-constant.ll index 03d73eede8c9..45717f985c23 100644 --- a/llvm/test/CodeGen/X86/urem-i8-constant.ll +++ b/llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -9,6 +9,7 @@ define i8 @foo(i8 %tmp325) { ; CHECK-NEXT: andl $28672, %eax # imm = 0x7000 ; CHECK-NEXT: shrl $12, %eax ; CHECK-NEXT: movb $37, %dl +; CHECK-NEXT: # kill: %AL %AL %EAX ; CHECK-NEXT: mulb %dl ; CHECK-NEXT: subb %al, %cl ; CHECK-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/urem-power-of-two.ll b/llvm/test/CodeGen/X86/urem-power-of-two.ll index 5fb4bd901a2e..9e27809c297d 100644 --- a/llvm/test/CodeGen/X86/urem-power-of-two.ll +++ b/llvm/test/CodeGen/X86/urem-power-of-two.ll @@ -57,8 +57,10 @@ define i8 @and_pow_2(i8 %x, i8 %y) { ; CHECK: # BB#0: ; CHECK-NEXT: andb $4, %sil ; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: # kill: %EAX %EAX %AX ; CHECK-NEXT: divb %sil ; CHECK-NEXT: movzbl %ah, %eax # NOREX +; CHECK-NEXT: # kill: %AL %AL %EAX ; CHECK-NEXT: retq ; %and = and i8 %y, 4 @@ -66,7 +68,7 @@ define i8 @and_pow_2(i8 %x, i8 %y) { ret i8 %urem } -; A vector splat constant divisor should get the same treatment as a scalar. +; A vector splat constant divisor should get the same treatment as a scalar. define <4 x i32> @vec_const_pow_2(<4 x i32> %x) { ; CHECK-LABEL: vec_const_pow_2: diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 7834b2804247..0ad5ef7ee8f5 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -81,6 +81,7 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { ; ; AVX-LABEL: fptosi_4f64_to_2i32: ; AVX: # BB#0: +; AVX-NEXT: # kill ; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index 1d6c785bafe4..67875b3ef23e 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -17,6 +17,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind { ; ; X64-LABEL: t1: ; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %rdi, %xmm0 ; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll index 64c014784f4a..2aae35591ab2 100644 --- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -16,6 +16,7 @@ define x86_mmx @t0(i32 %A) nounwind { ; ; X64-LABEL: t0: ; X64: ## BB#0: +; X64-NEXT: ## kill: %EDI %EDI %RDI ; X64-NEXT: movd %rdi, %xmm0 ; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 6cb4b180b2e0..43f5318a6070 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -61,6 +61,7 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) { ; AVX-LABEL: sitofp_4i32_to_2f64: ; AVX: # BB#0: ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: # kill ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %cvt = sitofp <4 x i32> %a to <4 x double> @@ -98,6 +99,7 @@ define <2 x double> 
@sitofp_8i16_to_2f64(<8 x i16> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -105,6 +107,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = sitofp <8 x i16> %a to <8 x double> @@ -144,6 +147,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -152,6 +156,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = sitofp <16 x i8> %a to <16 x double> @@ -432,6 +437,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -445,6 +451,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = uitofp <4 x i32> %a to <4 x double> @@ -482,6 +489,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -489,6 +497,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = uitofp <8 x i16> %a to <8 x double> @@ -528,6 +537,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -536,6 +546,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = uitofp <16 x i8> %a to <16 x double> @@ -890,6 +901,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -897,6 +909,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: 
vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = sitofp <8 x i16> %a to <8 x float> @@ -939,6 +952,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -947,6 +961,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = sitofp <16 x i8> %a to <16 x float> @@ -1384,6 +1399,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1391,6 +1407,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { ; AVX2: # BB#0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = uitofp <8 x i16> %a to <8 x float> @@ -1433,6 +1450,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1441,6 +1459,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cvt = uitofp <16 x i8> %a to <16 x float> diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll index 5b298d8c6eb1..076f748009b3 100644 --- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll +++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll @@ -13,18 +13,20 @@ define i16 @test1(float %f) nounwind { ; CHECK-NEXT: minss LCPI0_2, %xmm0 ; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retl - %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1] - %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] - %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] - %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 
x float>> [#uses=1] - %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] - %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1] - %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; [#uses=1] - %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; [#uses=1] - ret i16 %tmp69 +; + %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1] + %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] + %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] + %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] + %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] + %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] + %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] + %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1] + %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; [#uses=1] + %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; [#uses=1] + ret i16 %tmp69 } define i16 @test2(float %f) nounwind { @@ -37,15 +39,17 @@ define i16 @test2(float %f) nounwind { ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: maxss %xmm1, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %eax +; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retl - %tmp28 = fsub float %f, 1.000000e+00 ; [#uses=1] - %tmp37 = fmul float %tmp28, 5.000000e-01 ; [#uses=1] - %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1] - %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] - %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] - %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; [#uses=1] - %tmp69 = trunc i32 %tmp to i16 ; [#uses=1] - ret i16 %tmp69 +; + %tmp28 = fsub float %f, 1.000000e+00 ; [#uses=1] + %tmp37 = fmul float %tmp28, 5.000000e-01 ; [#uses=1] + %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1] + %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] + %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] + %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; [#uses=1] + %tmp69 = trunc i32 %tmp to i16 ; [#uses=1] + ret i16 %tmp69 } declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) @@ -68,6 +72,7 @@ define <4 x float> @test3(<4 x float> %A, float 
*%b, i32 %C) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: roundss $4, (%eax), %xmm0 ; CHECK-NEXT: retl +; %a = load float , float *%b %B = insertelement <4 x float> undef, float %a, i32 0 %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4) @@ -86,6 +91,7 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind { ; CHECK-NEXT: roundss $4, %xmm1, %xmm0 ; CHECK-NEXT: addl $28, %esp ; CHECK-NEXT: retl +; %a = load float , float *%b %B = insertelement <4 x float> undef, float %a, i32 0 %q = call <4 x float> @f() @@ -101,6 +107,7 @@ define <2 x double> @test5() nounwind uwtable readnone noinline { ; CHECK-NEXT: movl $128, %eax ; CHECK-NEXT: cvtsi2sdl %eax, %xmm0 ; CHECK-NEXT: retl +; entry: %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> , i32 128) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll index dfc186bef052..c0e02bd15996 100644 --- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -65,7 +65,9 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { ; ; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32: ; AVX512F: # BB#0: +; AVX512F-NEXT: # kill ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32: @@ -142,7 +144,9 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) { ; ; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32: ; AVX512F: # BB#0: +; AVX512F-NEXT: # kill ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 2c8d01326bc1..25e1f68679da 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -80,6 +80,7 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; XOP-NEXT: vmovd %edi, %xmm0 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $0, %xmm0, %eax +; XOP-NEXT: # kill: %AL %AL %EAX ; XOP-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b @@ -88,6 +89,7 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-LABEL: test_bitreverse_i16: ; SSE: # BB#0: +; SSE-NEXT: # kill: %EDI %EDI %RDI ; SSE-NEXT: movl %edi, %ecx ; SSE-NEXT: andl $32768, %ecx # imm = 0x8000 ; SSE-NEXT: movl %edi, %eax @@ -148,10 +150,12 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-NEXT: shrl $15, %ecx ; SSE-NEXT: orl %edi, %ecx ; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: # kill: %AX %AX %EAX ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i16: ; AVX: # BB#0: +; AVX-NEXT: # kill: %EDI %EDI %RDI ; AVX-NEXT: movl %edi, %ecx ; AVX-NEXT: andl $32768, %ecx # imm = 0x8000 ; AVX-NEXT: movl %edi, %eax @@ -212,6 +216,7 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; AVX-NEXT: shrl $15, %ecx ; AVX-NEXT: orl %edi, %ecx ; AVX-NEXT: orl %ecx, %eax +; AVX-NEXT: # kill: %AX %AX %EAX ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i16: @@ -219,6 +224,7 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; XOP-NEXT: vmovd %edi, %xmm0 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vmovd %xmm0, %eax +; XOP-NEXT: # kill: %AX %AX %EAX ; XOP-NEXT: retq %b = call i16 @llvm.bitreverse.i16(i16 %a) ret i16 %b @@ -227,6 +233,7 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind 
{ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; SSE-LABEL: test_bitreverse_i32: ; SSE: # BB#0: +; SSE-NEXT: # kill: %EDI %EDI %RDI ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: shll $31, %eax ; SSE-NEXT: movl %edi, %ecx @@ -353,6 +360,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; ; AVX-LABEL: test_bitreverse_i32: ; AVX: # BB#0: +; AVX-NEXT: # kill: %EDI %EDI %RDI ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: shll $31, %eax ; AVX-NEXT: movl %edi, %ecx diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 6c93f69ba614..16bf596f3bbf 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -148,6 +148,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind { ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -155,6 +156,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = fcmp ogt <4 x double> %a0, %a1 ret <4 x i1> %1 @@ -200,6 +202,7 @@ define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind { ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -207,6 +210,7 @@ define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = fcmp ogt <8 x float> %a0, %a1 ret <8 x i1> %1 @@ -267,6 +271,7 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -274,6 +279,7 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = icmp sgt <4 x i64> %a0, %a1 ret <4 x i1> %1 @@ -319,6 +325,7 @@ define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -326,6 +333,7 @@ define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = icmp sgt <8 x i32> %a0, %a1 ret <8 x i1> %1 @@ -691,6 +699,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x 
double> %a0, <8 x double> %a1) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -874,6 +883,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 591e1f2f0e27..b091d1bca2ef 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -27,6 +27,7 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) { ; ALL-NEXT: movq %rax, %rcx ; ALL-NEXT: movq %rax, %rdx ; ALL-NEXT: movswl %ax, %esi +; ALL-NEXT: # kill: %EAX %EAX %RAX ; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx ; ALL-NEXT: shrq $48, %rdx @@ -57,6 +58,7 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) { ; ALL-NEXT: movq %rax, %rcx ; ALL-NEXT: movq %rax, %rdx ; ALL-NEXT: movswl %ax, %esi +; ALL-NEXT: # kill: %EAX %EAX %RAX ; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx ; ALL-NEXT: shrq $48, %rdx @@ -88,6 +90,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX1-NEXT: movq %rdx, %r8 ; AVX1-NEXT: movq %rdx, %r10 ; AVX1-NEXT: movswl %dx, %r9d +; AVX1-NEXT: # kill: %EDX %EDX %RDX ; AVX1-NEXT: shrl $16, %edx ; AVX1-NEXT: shrq $32, %r8 ; AVX1-NEXT: shrq $48, %r10 @@ -95,6 +98,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: movq %rdi, %rsi ; AVX1-NEXT: movswl %di, %ecx +; AVX1-NEXT: # kill: %EDI %EDI %RDI ; AVX1-NEXT: shrl $16, %edi ; AVX1-NEXT: shrq $32, %rax ; AVX1-NEXT: shrq $48, %rsi @@ -135,6 +139,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX2-NEXT: movq %rdx, %r8 ; AVX2-NEXT: movq %rdx, %r10 ; AVX2-NEXT: movswl %dx, %r9d +; AVX2-NEXT: # kill: %EDX %EDX %RDX ; AVX2-NEXT: shrl $16, %edx ; AVX2-NEXT: shrq $32, %r8 ; AVX2-NEXT: shrq $48, %r10 @@ -142,6 +147,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: movq %rdi, %rsi ; AVX2-NEXT: movswl %di, %ecx +; AVX2-NEXT: # kill: %EDI %EDI %RDI ; AVX2-NEXT: shrl $16, %edi ; AVX2-NEXT: shrq $32, %rax ; AVX2-NEXT: shrq $48, %rsi @@ -182,6 +188,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX512-NEXT: movq %rdx, %r8 ; AVX512-NEXT: movq %rdx, %r10 ; AVX512-NEXT: movswl %dx, %r9d +; AVX512-NEXT: # kill: %EDX %EDX %RDX ; AVX512-NEXT: shrl $16, %edx ; AVX512-NEXT: shrq $32, %r8 ; AVX512-NEXT: shrq $48, %r10 @@ -189,6 +196,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) { ; AVX512-NEXT: movq %rdi, %rax ; AVX512-NEXT: movq %rdi, %rsi ; AVX512-NEXT: movswl %di, %ecx +; AVX512-NEXT: # kill: %EDI %EDI %RDI ; AVX512-NEXT: shrl $16, %edi ; AVX512-NEXT: shrq $32, %rax ; AVX512-NEXT: shrq $48, %rsi @@ -241,6 +249,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX1-NEXT: movswl %cx, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm9 ; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # 
kill: %EAX %EAX %RAX ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm10 @@ -255,6 +264,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX1-NEXT: movswl %cx, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm13 ; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # kill: %EAX %EAX %RAX ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm14 @@ -269,6 +279,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX1-NEXT: movswl %cx, %ecx ; AVX1-NEXT: vmovd %ecx, %xmm3 ; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # kill: %EAX %EAX %RAX ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm4 @@ -333,6 +344,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX2-NEXT: movswl %cx, %ecx ; AVX2-NEXT: vmovd %ecx, %xmm9 ; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: %EAX %EAX %RAX ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm10 @@ -347,6 +359,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX2-NEXT: movswl %cx, %ecx ; AVX2-NEXT: vmovd %ecx, %xmm13 ; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: %EAX %EAX %RAX ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm14 @@ -361,6 +374,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX2-NEXT: movswl %cx, %ecx ; AVX2-NEXT: vmovd %ecx, %xmm3 ; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: %EAX %EAX %RAX ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm4 @@ -425,6 +439,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX512-NEXT: movswl %cx, %ecx ; AVX512-NEXT: vmovd %ecx, %xmm9 ; AVX512-NEXT: movswl %ax, %ecx +; AVX512-NEXT: # kill: %EAX %EAX %RAX ; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: cwtl ; AVX512-NEXT: vmovd %eax, %xmm11 @@ -439,6 +454,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX512-NEXT: movswl %cx, %ecx ; AVX512-NEXT: vmovd %ecx, %xmm14 ; AVX512-NEXT: movswl %ax, %ecx +; AVX512-NEXT: # kill: %EAX %EAX %RAX ; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: cwtl ; AVX512-NEXT: vmovd %eax, %xmm15 @@ -453,6 +469,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) { ; AVX512-NEXT: movswl %cx, %ecx ; AVX512-NEXT: vmovd %ecx, %xmm1 ; AVX512-NEXT: movswl %ax, %ecx +; AVX512-NEXT: # kill: %EAX %EAX %RAX ; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: cwtl ; AVX512-NEXT: vmovd %eax, %xmm4 @@ -558,6 +575,7 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) { ; ALL-NEXT: movq %rax, %rcx ; ALL-NEXT: movq %rax, %rdx ; ALL-NEXT: movswl %ax, %esi +; ALL-NEXT: # kill: %EAX %EAX %RAX ; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx ; ALL-NEXT: shrq $48, %rdx @@ -1441,6 +1459,7 @@ define i16 @cvt_f32_to_i16(float %a0) { ; ALL: # BB#0: ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: # kill: %AX %AX %EAX ; ALL-NEXT: retq %1 = fptrunc float %a0 to half %2 = bitcast half %1 to i16 @@ -2383,6 +2402,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) { ; AVX1-NEXT: movw %ax, %bx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r14d @@ -2429,6 +2449,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) { ; AVX2-NEXT: movw %ax, %bx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: 
movzwl %ax, %r14d @@ -2474,6 +2495,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) { ; AVX512-NEXT: movw %ax, %bx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d @@ -2523,6 +2545,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) { ; AVX1-NEXT: movw %ax, %bx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r14d @@ -2570,6 +2593,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) { ; AVX2-NEXT: movw %ax, %bx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r14d @@ -2616,6 +2640,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) { ; AVX512-NEXT: movw %ax, %bx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d @@ -2667,6 +2692,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) { ; AVX1-NEXT: movw %ax, %bx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r14d @@ -2714,6 +2740,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) { ; AVX2-NEXT: movw %ax, %bx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r14d @@ -2760,6 +2787,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) { ; AVX512-NEXT: movw %ax, %bx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d @@ -2817,6 +2845,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX1-NEXT: movw %ax, %bx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r15d @@ -2842,6 +2871,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX1-NEXT: movw %ax, %bx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r15d @@ -2897,6 +2927,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX2-NEXT: movw %ax, %bx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r15d @@ -2922,6 +2953,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX2-NEXT: movw %ax, %bx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r15d @@ 
-2975,6 +3007,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX512-NEXT: movw %ax, %bx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d @@ -2999,6 +3032,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) { ; AVX512-NEXT: movw %ax, %bx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d @@ -3126,6 +3160,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) { ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r15d ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp @@ -3181,6 +3216,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) { ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r15d ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp @@ -3234,6 +3270,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) { ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r15d ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -3283,6 +3320,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) { ; AVX1-NEXT: movw %ax, %bp ; AVX1-NEXT: shll $16, %ebp ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %ebx @@ -3338,6 +3376,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) { ; AVX2-NEXT: movw %ax, %bp ; AVX2-NEXT: shll $16, %ebp ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %ebx @@ -3392,6 +3431,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) { ; AVX512-NEXT: movw %ax, %bp ; AVX512-NEXT: shll $16, %ebp ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %ebx ; AVX512-NEXT: orl %ebp, %ebx @@ -3452,6 +3492,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) { ; AVX1-NEXT: movw %ax, %bp ; AVX1-NEXT: shll $16, %ebp ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %ebx @@ -3507,6 +3548,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) { ; AVX2-NEXT: movw %ax, %bp ; AVX2-NEXT: shll $16, %ebp ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %ebx @@ -3561,6 +3603,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) { ; AVX512-NEXT: movw %ax, %bp ; 
AVX512-NEXT: shll $16, %ebp ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %ebx ; AVX512-NEXT: orl %ebp, %ebx @@ -3655,6 +3698,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r13d ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp @@ -3662,6 +3706,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r15d @@ -3748,6 +3793,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r13d ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp @@ -3755,6 +3801,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r14d ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r15d @@ -3838,12 +3885,14 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) { ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r13d ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r14d ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r15d ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 7aadeaa771e9..1bb7181d31df 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -1494,6 +1494,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %cl, %al ; AVX512BW-NEXT: movb $7, %dil +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %dl ; AVX512BW-NEXT: movzbl %dl, %edx @@ -1506,6 +1507,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %cl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %cl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %sil ; AVX512BW-NEXT: movzbl %sil, %eax @@ -1520,6 +1522,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1533,6 +1536,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb 
$7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1546,6 +1550,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1559,6 +1564,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1572,6 +1578,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1585,6 +1592,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1598,6 +1606,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1611,6 +1620,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1624,6 +1634,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1637,6 +1648,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1650,6 +1662,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1663,6 +1676,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1676,6 +1690,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1689,6 +1704,7 @@ define <64 
x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1703,6 +1719,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %esi @@ -1715,6 +1732,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %cl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %cl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %dl ; AVX512BW-NEXT: movzbl %dl, %eax @@ -1729,6 +1747,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1742,6 +1761,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1755,6 +1775,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1768,6 +1789,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1781,6 +1803,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1794,6 +1817,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1807,6 +1831,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1820,6 +1845,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1833,6 +1859,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, 
%cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1846,6 +1873,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1859,6 +1887,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1872,6 +1901,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1885,6 +1915,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1898,6 +1929,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1913,6 +1945,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %esi @@ -1925,6 +1958,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %cl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %cl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %dl ; AVX512BW-NEXT: movzbl %dl, %eax @@ -1939,6 +1973,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1952,6 +1987,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1965,6 +2001,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1978,6 +2015,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -1991,6 +2029,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # 
kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2004,6 +2043,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2017,6 +2057,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2030,6 +2071,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2043,6 +2085,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2056,6 +2099,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2069,6 +2113,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2082,6 +2127,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2095,6 +2141,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2108,6 +2155,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2121,6 +2169,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %esi @@ -2133,6 +2182,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %cl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %cl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %dl ; AVX512BW-NEXT: movzbl %dl, %eax @@ -2147,6 +2197,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; 
AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2160,6 +2211,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2173,6 +2225,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2186,6 +2239,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2199,6 +2253,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2212,6 +2267,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2225,6 +2281,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2238,6 +2295,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2251,6 +2309,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2264,6 +2323,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2277,6 +2337,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2290,6 +2351,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2303,6 +2365,7 @@ define <64 x i8> 
@test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax @@ -2316,6 +2379,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: shrb $7, %dl ; AVX512BW-NEXT: sarb $2, %al ; AVX512BW-NEXT: addb %dl, %al +; AVX512BW-NEXT: # kill: %AL %AL %EAX ; AVX512BW-NEXT: mulb %dil ; AVX512BW-NEXT: subb %al, %cl ; AVX512BW-NEXT: movzbl %cl, %eax diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index 9220cc800992..06c785575339 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -105,7 +105,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512CD-LABEL: testv2i64: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: @@ -221,7 +223,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512CD-LABEL: testv2i64u: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64u: @@ -408,7 +412,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv4i32: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32: @@ -571,7 +577,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv4i32u: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32u: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index fd48d484f98e..ed31e49cb07c 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -70,7 +70,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX512CD-LABEL: testv4i64: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0) @@ -138,7 +140,9 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512CD-LABEL: testv4i64u: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1) @@ -220,7 +224,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv8i32: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0) @@ -293,7 +299,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv8i32u: ; AVX512CD: ## BB#0: +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: 
retq %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1) diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 3960c445daa7..e3daba5eb166 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -786,6 +786,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_2i1_to_2i64: @@ -967,6 +968,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_4i1_to_4i32: @@ -1161,6 +1163,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_4i1_to_4i64: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 7fc7ea2f73b6..81eaeb998075 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -311,6 +311,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -323,7 +324,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -1218,6 +1222,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1230,8 +1235,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 104f0ffe7cb8..af076fbbd818 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -214,7 +214,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM1 %YMM1 
%ZMM1 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -818,8 +821,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, ret <16 x i16> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 09229b440e3a..213e2a41a662 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -280,6 +280,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -292,7 +293,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -952,6 +956,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -964,8 +969,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 06a724d4a9d9..f9ff3092388b 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -191,7 +191,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -673,8 +676,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; 
AVX512-NEXT: retq %shift = lshr <16 x i16> %a, ret <16 x i16> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index bf1e947ad7da..7202f1ec0cb8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -237,6 +237,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -247,7 +248,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -837,8 +841,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v8i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 88ef533272eb..bc7d20cd86d8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -166,7 +166,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; ; AVX512-LABEL: var_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -585,8 +588,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; ; AVX512-LABEL: constant_shift_v16i16: ; AVX512: ## BB#0: +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, ret <16 x i16> %shift diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 46e38a2c2c40..181b2e420203 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1223,12 +1223,14 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { define <4 x double> @insert_reg_and_zero_v4f64(double %a) { ; AVX1-LABEL: insert_reg_and_zero_v4f64: ; AVX1: # BB#0: +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_reg_and_zero_v4f64: ; AVX2: # BB#0: +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll 
index 82fbcf45fb5e..a76c4a76a16b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -73,6 +73,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) { define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> @@ -96,6 +97,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) { define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> @@ -121,6 +123,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) { define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) { ; CHECK-LABEL: combine_permd_as_vpbroadcastd256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq @@ -142,6 +145,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) { define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) { ; CHECK-LABEL: combine_permd_as_vpbroadcastq256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq @@ -165,6 +169,7 @@ define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) { define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) { ; CHECK-LABEL: combine_permd_as_vpbroadcastss256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> @@ -175,6 +180,7 @@ define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) { define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) { ; CHECK-LABEL: combine_permd_as_vpbroadcastsd256: ; CHECK: # BB#0: +; CHECK-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 22f5a0fba1d1..9ab56a308e14 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ @@ -200,6 +199,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u: @@ -212,6 +212,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # 
kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> @@ -228,6 +229,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u: @@ -238,6 +240,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> @@ -256,6 +259,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0: @@ -268,6 +272,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> @@ -286,6 +291,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0: @@ -298,6 +304,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> @@ -319,6 +326,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1: @@ -333,6 +341,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> , <8 x i1> %b, <8 x i32> @@ -353,6 +362,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL %AL %EAX ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: @@ -366,6 +376,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AL %AL %EAX ; VL_BW_DQ-NEXT: retq %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> %c1 = bitcast <8 x i1>%c to i8 @@ -382,6 +393,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX %AX %EAX ; AVX512F-NEXT: retq ; ; 
VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: @@ -392,6 +404,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovw %k0, %eax +; VL_BW_DQ-NEXT: # kill: %AX %AX %EAX ; VL_BW_DQ-NEXT: retq %b = bitcast i16 %a to <16 x i1> %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer @@ -400,6 +413,40 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { } define i64 @shuf64i1_zero(i64 %a) { +; AVX512F-LABEL: shuf64i1_zero: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Ltmp0: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Ltmp1: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Ltmp2: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $96, %rsp +; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rsp) +; AVX512F-NEXT: movl (%rsp), %ecx +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: orq %rcx, %rax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; ; VL_BW_DQ-LABEL: shuf64i1_zero: ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: kmovq %rdi, %k0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index 7dfbd565223e..d130e7ff00b2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -198,6 +198,12 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind { ; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; SSE2: # BB#0: +; SSE2-NEXT: # kill: %R9D %R9D %R9 +; SSE2-NEXT: # kill: %R8D %R8D %R8 +; SSE2-NEXT: # kill: %ECX %ECX %RCX +; SSE2-NEXT: # kill: %EDX %EDX %RDX +; SSE2-NEXT: # kill: %ESI %ESI %RSI +; SSE2-NEXT: # kill: %EDI %EDI %RDI ; SSE2-NEXT: movswq %di, %rax ; SSE2-NEXT: movswq %si, %rsi ; SSE2-NEXT: movswq %dx, %rdx @@ -234,6 +240,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; SSSE3: # BB#0: +; SSSE3-NEXT: # kill: %R9D %R9D %R9 +; SSSE3-NEXT: # kill: %R8D %R8D %R8 +; SSSE3-NEXT: # kill: %ECX %ECX %RCX +; SSSE3-NEXT: # kill: %EDX %EDX %RDX +; SSSE3-NEXT: # kill: %ESI %ESI %RSI +; SSSE3-NEXT: # kill: %EDI %EDI %RDI ; SSSE3-NEXT: movswq %di, %rax ; SSSE3-NEXT: movswq %si, %rsi ; SSSE3-NEXT: movswq %dx, %rdx @@ -271,6 +283,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: ; SSE41: # BB#0: ; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: # kill: %R9D %R9D %R9 +; SSE41-NEXT: # kill: %R8D %R8D %R8 +; SSE41-NEXT: # kill: %ECX %ECX 
%RCX +; SSE41-NEXT: # kill: %EDX %EDX %RDX +; SSE41-NEXT: # kill: %ESI %ESI %RSI +; SSE41-NEXT: # kill: %EDI %EDI %RDI ; SSE41-NEXT: movswq %di, %rax ; SSE41-NEXT: movswq %si, %rbx ; SSE41-NEXT: movswq %dx, %r11 @@ -298,6 +316,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; AVX: # BB#0: ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %rbx +; AVX-NEXT: # kill: %R9D %R9D %R9 +; AVX-NEXT: # kill: %R8D %R8D %R8 +; AVX-NEXT: # kill: %ECX %ECX %RCX +; AVX-NEXT: # kill: %EDX %EDX %RDX +; AVX-NEXT: # kill: %ESI %ESI %RSI +; AVX-NEXT: # kill: %EDI %EDI %RDI ; AVX-NEXT: movswq %di, %r10 ; AVX-NEXT: movswq %si, %r11 ; AVX-NEXT: movswq %dx, %r14 @@ -343,6 +367,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind { ; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSE2: # BB#0: +; SSE2-NEXT: # kill: %R9D %R9D %R9 +; SSE2-NEXT: # kill: %R8D %R8D %R8 +; SSE2-NEXT: # kill: %ECX %ECX %RCX +; SSE2-NEXT: # kill: %EDX %EDX %RDX +; SSE2-NEXT: # kill: %ESI %ESI %RSI +; SSE2-NEXT: # kill: %EDI %EDI %RDI ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 @@ -412,6 +442,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSSE3: # BB#0: +; SSSE3-NEXT: # kill: %R9D %R9D %R9 +; SSSE3-NEXT: # kill: %R8D %R8D %R8 +; SSSE3-NEXT: # kill: %ECX %ECX %RCX +; SSSE3-NEXT: # kill: %EDX %EDX %RDX +; SSSE3-NEXT: # kill: %ESI %ESI %RSI +; SSSE3-NEXT: # kill: %EDI %EDI %RDI ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 @@ -487,6 +523,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: # kill: %R9D %R9D %R9 +; SSE41-NEXT: # kill: %R8D %R8D %R8 +; SSE41-NEXT: # kill: %ECX %ECX %RCX +; SSE41-NEXT: # kill: %EDX %EDX %RDX +; SSE41-NEXT: # kill: %ESI %ESI %RSI +; SSE41-NEXT: # kill: %EDI %EDI %RDI ; SSE41-NEXT: movsbq %dil, %r15 ; SSE41-NEXT: movsbq %sil, %r14 ; SSE41-NEXT: movsbq %dl, %r11 @@ -548,6 +590,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx +; AVX-NEXT: # kill: %R9D %R9D %R9 +; AVX-NEXT: # kill: %R8D %R8D %R8 +; AVX-NEXT: # kill: %ECX %ECX %RCX +; AVX-NEXT: # kill: %EDX %EDX %RDX +; AVX-NEXT: # kill: %ESI %ESI %RSI +; AVX-NEXT: # kill: %EDI %EDI %RDI ; AVX-NEXT: movsbq %dil, %r10 ; AVX-NEXT: movsbq %sil, %r11 ; AVX-NEXT: movsbq %dl, %r14 @@ -1097,6 +1145,12 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind { ; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: ; SSE2: # BB#0: +; SSE2-NEXT: # kill: %R9D %R9D %R9 +; SSE2-NEXT: # kill: %R8D %R8D %R8 +; SSE2-NEXT: # kill: %ECX %ECX %RCX +; SSE2-NEXT: # kill: %EDX %EDX %RDX +; SSE2-NEXT: # kill: %ESI %ESI %RSI +; SSE2-NEXT: # kill: %EDI %EDI %RDI ; SSE2-NEXT: movswq %di, %r10 ; SSE2-NEXT: movswq %si, %rsi 
; SSE2-NEXT: movswq %dx, %r11 @@ -1130,6 +1184,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: ; SSSE3: # BB#0: +; SSSE3-NEXT: # kill: %R9D %R9D %R9 +; SSSE3-NEXT: # kill: %R8D %R8D %R8 +; SSSE3-NEXT: # kill: %ECX %ECX %RCX +; SSSE3-NEXT: # kill: %EDX %EDX %RDX +; SSSE3-NEXT: # kill: %ESI %ESI %RSI +; SSSE3-NEXT: # kill: %EDI %EDI %RDI ; SSSE3-NEXT: movswq %di, %r10 ; SSSE3-NEXT: movswq %si, %rsi ; SSSE3-NEXT: movswq %dx, %r11 @@ -1163,6 +1223,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: ; SSE41: # BB#0: +; SSE41-NEXT: # kill: %R9D %R9D %R9 +; SSE41-NEXT: # kill: %R8D %R8D %R8 +; SSE41-NEXT: # kill: %ECX %ECX %RCX +; SSE41-NEXT: # kill: %EDX %EDX %RDX +; SSE41-NEXT: # kill: %ESI %ESI %RSI +; SSE41-NEXT: # kill: %EDI %EDI %RDI ; SSE41-NEXT: movswq %di, %rax ; SSE41-NEXT: movswq %si, %rsi ; SSE41-NEXT: movswq %dx, %rdx @@ -1184,6 +1250,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; ; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: ; AVX1: # BB#0: +; AVX1-NEXT: # kill: %R9D %R9D %R9 +; AVX1-NEXT: # kill: %R8D %R8D %R8 +; AVX1-NEXT: # kill: %ECX %ECX %RCX +; AVX1-NEXT: # kill: %EDX %EDX %RDX +; AVX1-NEXT: # kill: %ESI %ESI %RSI +; AVX1-NEXT: # kill: %EDI %EDI %RDI ; AVX1-NEXT: movswq %di, %r10 ; AVX1-NEXT: movswq %si, %r11 ; AVX1-NEXT: movswq %dx, %rdx @@ -1205,6 +1277,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; ; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: ; AVX2: # BB#0: +; AVX2-NEXT: # kill: %R9D %R9D %R9 +; AVX2-NEXT: # kill: %R8D %R8D %R8 +; AVX2-NEXT: # kill: %ECX %ECX %RCX +; AVX2-NEXT: # kill: %EDX %EDX %RDX +; AVX2-NEXT: # kill: %ESI %ESI %RSI +; AVX2-NEXT: # kill: %EDI %EDI %RDI ; AVX2-NEXT: movswq %di, %r10 ; AVX2-NEXT: movswq %si, %r11 ; AVX2-NEXT: movswq %dx, %rdx diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 77791b0e28e4..db147a024e24 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -36,6 +36,7 @@ define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -43,6 +44,7 @@ define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -105,6 +107,7 @@ define <8 x i16> @trunc_add_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -148,6 +151,7 @@ define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -155,6 +159,7 @@ define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -379,6 +384,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq %1 = add <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -421,6 +427,7 @@ define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind { ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -428,6 +435,7 @@ define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -495,6 +503,7 @@ define <8 x i16> @trunc_add_const_v16i64_v16i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -537,6 +546,7 @@ define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind { ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -544,6 +554,7 @@ define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -767,6 +778,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq %1 = add <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -804,6 +816,7 @@ define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -811,6 +824,7 @@ define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 
; AVX512: # BB#0: ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -873,6 +887,7 @@ define <8 x i16> @trunc_sub_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -916,6 +931,7 @@ define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -923,6 +939,7 @@ define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -1147,6 +1164,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq %1 = sub <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1189,6 +1207,7 @@ define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind { ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1196,6 +1215,7 @@ define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -1262,6 +1282,7 @@ define <8 x i16> @trunc_sub_const_v16i64_v16i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1304,6 +1325,7 @@ define <8 x i16> @trunc_sub_const_v16i32_v16i16(<8 x i32> %a0) nounwind { ; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1311,6 +1333,7 @@ define <8 x i16> @trunc_sub_const_v16i32_v16i16(<8 x i32> %a0) nounwind { ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # 
kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = sub <8 x i32> %a0,
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -1534,6 +1557,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = sub <16 x i16> %a0,
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1615,6 +1639,7 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1630,6 +1655,7 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = mul <4 x i64> %a0, %a1
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -1780,6 +1806,7 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1843,6 +1870,7 @@ define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1850,6 +1878,7 @@ define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = mul <8 x i32> %a0, %a1
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2298,6 +2327,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = mul <16 x i16> %a0, %a1
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2365,6 +2395,7 @@ define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2377,6 +2408,7 @@ define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = mul <4 x i64> %a0,
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -2495,6 +2527,7 @@ define <8 x i16> @trunc_mul_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2556,6 +2589,7 @@ define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2563,6 +2597,7 @@ define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = mul <8 x i32> %a0,
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2930,6 +2965,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = mul <16 x i16> %a0,
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2965,6 +3001,7 @@ define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2972,6 +3009,7 @@ define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = and <4 x i64> %a0, %a1
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3030,6 +3068,7 @@ define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3071,6 +3110,7 @@ define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3078,6 +3118,7 @@ define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = and <8 x i32> %a0, %a1
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3288,6 +3329,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = and <16 x i16> %a0, %a1
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3326,6 +3368,7 @@ define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3333,6 +3376,7 @@ define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = and <4 x i64> %a0,
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3395,6 +3439,7 @@ define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3436,6 +3481,7 @@ define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3443,6 +3489,7 @@ define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = and <8 x i32> %a0,
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3656,6 +3703,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = and <16 x i16> %a0,
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3691,6 +3739,7 @@ define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3698,6 +3747,7 @@ define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = xor <4 x i64> %a0, %a1
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3756,6 +3806,7 @@ define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3797,6 +3848,7 @@ define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -3804,6 +3856,7 @@ define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = xor <8 x i32> %a0, %a1
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4014,6 +4067,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = xor <16 x i16> %a0, %a1
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4052,6 +4106,7 @@ define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4059,6 +4114,7 @@ define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = xor <4 x i64> %a0,
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4121,6 +4177,7 @@ define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4162,6 +4219,7 @@ define <8 x i16> @trunc_xor_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4169,6 +4227,7 @@ define <8 x i16> @trunc_xor_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = xor <8 x i32> %a0,
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4382,6 +4441,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = xor <16 x i16> %a0,
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4417,6 +4477,7 @@ define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4424,6 +4485,7 @@ define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = or <4 x i64> %a0, %a1
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4482,6 +4544,7 @@ define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4523,6 +4586,7 @@ define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4530,6 +4594,7 @@ define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = or <8 x i32> %a0, %a1
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4740,6 +4805,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = or <16 x i16> %a0, %a1
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4778,6 +4844,7 @@ define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4785,6 +4852,7 @@ define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = or <4 x i64> %a0,
 %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4847,6 +4915,7 @@ define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4888,6 +4957,7 @@ define <8 x i16> @trunc_or_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -4895,6 +4965,7 @@ define <8 x i16> @trunc_or_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = or <8 x i32> %a0,
 %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -5108,6 +5179,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 %1 = or <16 x i16> %a0,
 %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -5213,6 +5285,7 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -5231,6 +5304,7 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: retq
 %1 = sext <4 x i32> %a0 to <4 x i64>
 %2 = sext <4 x i32> %a1 to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index fb6c41ab1ce4..fddd10dd722f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -143,6 +143,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -251,12 +252,15 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512BW-LABEL: trunc8i32_8i16:
 ; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512BW-NEXT: retq
 entry:
 %0 = trunc <8 x i32> %a to <8 x i16>
@@ -314,6 +318,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ;
 ; AVX512BW-LABEL: trunc8i32_8i8:
 ; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovq %xmm0, (%rax)
@@ -434,6 +439,8 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX512BW-LABEL: trunc2x4i64_8i32:
 ; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -534,6 +541,8 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX512BW-LABEL: trunc2x4i64_8i16:
 ; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
diff --git a/llvm/test/CodeGen/X86/widen_bitops-0.ll b/llvm/test/CodeGen/X86/widen_bitops-0.ll
index 9d9c696bb6d3..f8316d0e1ea2 100644
--- a/llvm/test/CodeGen/X86/widen_bitops-0.ll
+++ b/llvm/test/CodeGen/X86/widen_bitops-0.ll
@@ -141,6 +141,9 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL %AL %EAX
+; X32-SSE-NEXT: # kill: %DL %DL %EDX
+; X32-SSE-NEXT: # kill: %CL %CL %ECX
 ; X32-SSE-NEXT: retl
 ;
 ; X64-SSE-LABEL: and_v3i8_as_i24:
@@ -155,6 +158,9 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL %AL %EAX
+; X64-SSE-NEXT: # kill: %DL %DL %EDX
+; X64-SSE-NEXT: # kill: %CL %CL %ECX
 ; X64-SSE-NEXT: retq
 %1 = bitcast <3 x i8> %a to i24
 %2 = bitcast <3 x i8> %b to i24
@@ -176,6 +182,9 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL %AL %EAX
+; X32-SSE-NEXT: # kill: %DL %DL %EDX
+; X32-SSE-NEXT: # kill: %CL %CL %ECX
 ; X32-SSE-NEXT: retl
 ;
 ; X64-SSE-LABEL: xor_v3i8_as_i24:
@@ -190,6 +199,9 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL %AL %EAX
+; X64-SSE-NEXT: # kill: %DL %DL %EDX
+; X64-SSE-NEXT: # kill: %CL %CL %ECX
 ; X64-SSE-NEXT: retq
 %1 = bitcast <3 x i8> %a to i24
 %2 = bitcast <3 x i8> %b to i24
@@ -211,6 +223,9 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL %AL %EAX
+; X32-SSE-NEXT: # kill: %DL %DL %EDX
+; X32-SSE-NEXT: # kill: %CL %CL %ECX
 ; X32-SSE-NEXT: retl
 ;
 ; X64-SSE-LABEL: or_v3i8_as_i24:
@@ -225,6 +240,9 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
 ; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
 ; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL %AL %EAX
+; X64-SSE-NEXT: # kill: %DL %DL %EDX
+; X64-SSE-NEXT: # kill: %CL %CL %ECX
 ; X64-SSE-NEXT: retq
 %1 = bitcast <3 x i8> %a to i24
 %2 = bitcast <3 x i8> %b to i24
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index 00e851ce9469..2899e38b71cd 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -185,7 +185,7 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
 ; CHECK-NEXT: je [[STRINGS_EQUAL]]
 ;
 ; CHECK: [[STRINGS_EQUAL]]
-; CHECK-NEXT: popq
+; CHECK: popq
 define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
 entry:
 %cmp.i = icmp eq i8* %vk1, null
diff --git a/llvm/test/CodeGen/X86/xaluo.ll b/llvm/test/CodeGen/X86/xaluo.ll
index 31e18989144a..eb0fd8649868 100644
--- a/llvm/test/CodeGen/X86/xaluo.ll
+++ b/llvm/test/CodeGen/X86/xaluo.ll
@@ -746,6 +746,7 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: korw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: # kill
 ; KNL-NEXT: retq
 %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
 %mul.overflow = extractvalue { i64, i1 } %mul, 1