From 516e14cd8e34922553eb4b7e39ccb2934af1b7b7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 11 Jul 2016 05:36:48 +0000 Subject: [PATCH] [AVX512] Use vpternlog with an immediate of 0xff to create 512-bit all one vectors. llvm-svn: 275045 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +- llvm/lib/Target/X86/X86InstrAVX512.td | 2 + llvm/lib/Target/X86/X86InstrInfo.cpp | 16 ++- llvm/test/CodeGen/X86/avx512-build-vector.ll | 7 +- llvm/test/CodeGen/X86/avx512-calling-conv.ll | 22 ++-- llvm/test/CodeGen/X86/avx512-cvt.ll | 6 +- llvm/test/CodeGen/X86/avx512-ext.ll | 6 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 114 ++++++++++-------- llvm/test/CodeGen/X86/avx512-vbroadcast.ll | 3 +- llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 6 +- llvm/test/CodeGen/X86/masked_memop.ll | 9 +- .../CodeGen/X86/vector-compare-results.ll | 12 +- llvm/test/CodeGen/X86/vector-sext.ll | 33 +++-- .../X86/vector-shuffle-combining-avx512bw.ll | 4 +- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 88 +++++++------- 15 files changed, 195 insertions(+), 139 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4cbffcc2f211..8cb528ec9e35 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6554,11 +6554,11 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget.hasInt256())) + if (VT == MVT::v4i32 || VT == MVT::v16i32 || + (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; - if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, Subtarget, DAG, DL); } return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 357b51797184..0b50b82b154a 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -416,6 +416,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in { def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllZerosV))]>; +def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16i32 immAllOnesV))]>; } let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index cc0388bb0db6..a6a1714c21ab 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -5547,6 +5547,15 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::AVX512_512_SETALLONES: { + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(get(X86::VPTERNLOGDZrri)); + // VPTERNLOGD needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } case X86::TEST8ri_NOREX: MI.setDesc(get(X86::TEST8ri)); return true; @@ -6231,6 +6240,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else switch (LoadMI.getOpcode()) { case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: Alignment = 64; break; case X86::AVX2_SETALLONES: @@ -6281,6 +6291,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. @@ -6312,7 +6323,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getFloatTy(MF.getFunction()->getContext()); else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); - else if (Opc == X86::AVX512_512_SET0) + else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || Opc == X86::AVX512_256_SET0) @@ -6320,7 +6331,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || + Opc == X86::AVX512_512_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll index 0f89aa71162e..980b87187d98 100644 --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -4,7 +4,8 @@ define <16 x i32> @test2(<16 x i32> %x) { ; CHECK-LABEL: test2: ; CHECK: ## BB#0: -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = add <16 x i32>, %x ret <16 x i32>%res @@ -15,8 +16,8 @@ define <16 x float> @test3(<4 x float> %a) { ; CHECK: ## BB#0: ; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 303e7ac51824..35e7448e09bc 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -30,7 +30,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -52,7 +53,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b @@ -68,7 +70,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -91,7 +94,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0 +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: retl @@ -183,7 +186,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: Ltmp1: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: callq _func16xi1 ; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -212,7 +216,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL_X32-NEXT: Ltmp1: ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: calll _func16xi1 ; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -286,7 +291,8 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: movb $85, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq @@ -322,7 +328,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: movb $85, %al ; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0 +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 57e8a1341513..914f859927be 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -685,7 +685,8 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; KNL-NEXT: retq ; @@ -748,7 +749,8 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; KNL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index a944e85f71b0..dac40b994289 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1409,7 +1409,8 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; KNL-NEXT: knotw %k0, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: retq ; @@ -1465,7 +1466,8 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; KNL-LABEL: sext_16i1_16i32: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: sext_16i1_16i32: diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index b867297df741..af41de109e11 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -349,7 +349,8 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB17_1: ; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 ; KNL-NEXT: LBB17_3: -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -386,7 +387,8 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB18_3: ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -475,7 +477,8 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-NEXT: movw $1, %cx ; KNL-NEXT: cmovgw %ax, %cx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -512,25 +515,25 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: movl {{.*}}(%rip), %eax +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: kmovw (%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; KNL-NEXT: movl $1, %ecx -; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 +; KNL-NEXT: movl $1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; KNL-NEXT: vpsllw $7, %ymm2, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 @@ -570,30 +573,30 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: movl {{.*}}(%rip), %eax +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: kmovw (%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; KNL-NEXT: xorl %ecx, %ecx +; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %edx, %esi -; KNL-NEXT: setg %cl -; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: setg %al +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -628,7 +631,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $7, %k2, %k1 ; KNL-NEXT: korw %k1, %k0, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -1368,7 +1372,8 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_8i1: @@ -1385,7 +1390,8 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) { ; KNL-LABEL: load_16i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_16i1: @@ -1403,7 +1409,8 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -1422,7 +1429,8 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq @@ -1441,11 +1449,11 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; KNL-LABEL: load_32i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: movl {{.*}}(%rip), %eax -; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: kmovw 2(%rdi), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 ; KNL-NEXT: retq ; @@ -1463,20 +1471,20 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; KNL-LABEL: load_64i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: movl {{.*}}(%rip), %eax -; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: kmovw 2(%rdi), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: kmovw 4(%rdi), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: kmovw 6(%rdi), %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; KNL-NEXT: kmovw 4(%rdi), %k1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: kmovw 6(%rdi), %k1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_64i1: diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll index 6a7ed02e0311..299b990f6254 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -218,7 +218,8 @@ define <16 x i32> @test_vbroadcast() { ; ALL: # BB#0: # %entry ; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1 -; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: knotw %k1, %k1 ; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 49aea228182a..69be3685ecd1 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -865,7 +865,8 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; KNL-NEXT: kxnorw %k1, %k0, %k1 -; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: retq ; @@ -889,7 +890,8 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k1 -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index c31b8381aebd..4220308b0086 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -2473,7 +2473,8 @@ define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x ; AVX512F-NEXT: ## BB#31: ## %cond.load43 ; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: LBB50_32: ## %else44 -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -5676,7 +5677,8 @@ define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: ## BB#15: ## %cond.load19 ; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: LBB53_16: ## %else20 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2 @@ -6116,7 +6118,8 @@ define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 ; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: LBB54_32: ## %else44 -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 16bf596f3bbf..9c89d0129f85 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -706,7 +706,8 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX512-LABEL: test_cmp_v8f64: ; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 @@ -767,7 +768,8 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX512-LABEL: test_cmp_v16f32: ; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 @@ -890,7 +892,8 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512-LABEL: test_cmp_v8i64: ; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 @@ -954,7 +957,8 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512-LABEL: test_cmp_v16i32: ; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index e3daba5eb166..672eeac0c7f2 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -785,7 +785,8 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; @@ -966,7 +967,8 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512-NEXT: retq @@ -1162,7 +1164,8 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; @@ -1455,7 +1458,8 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1848,7 +1852,8 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: movzbl (%rdi), %eax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: retq ; @@ -2350,7 +2355,8 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512-LABEL: load_sext_16i1_to_16i8: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -2887,7 +2893,8 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512-LABEL: load_sext_16i1_to_16i16: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; @@ -3731,13 +3738,13 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512-LABEL: load_sext_32i1_to_32i8: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: movl {{.*}}(%rip), %eax -; AVX512-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: kmovw 2(%rdi), %k1 -; AVX512-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: kmovw 2(%rdi), %k1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index ef492e053eb2..ddb83c604307 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -358,9 +358,9 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; CHECK-LABEL: combine_pshufb_identity_mask: ; CHECK: # BB#0: ; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 9ab56a308e14..1c128645ad14 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -74,13 +74,13 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512F-NEXT: movq {{.*}}(%rip), %rax -; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -105,14 +105,14 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vpslld $31, %zmm1, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -163,13 +163,13 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: movq {{.*}}(%rip), %rax -; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -192,7 +192,8 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 @@ -224,7 +225,8 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -252,7 +254,8 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 @@ -284,7 +287,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 @@ -316,14 +320,14 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: movq {{.*}}(%rip), %rax -; AVX512F-NEXT: movb $51, %cl -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vpbroadcastq %rax, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: movb $51, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: %AL %AL %EAX @@ -355,11 +359,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: %AL %AL %EAX @@ -371,7 +375,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] -; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 +; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0 ; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -388,7 +392,8 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -426,8 +431,9 @@ define i64 @shuf64i1_zero(i64 %a) { ; AVX512F-NEXT: andq $-32, %rsp ; AVX512F-NEXT: subq $96, %rsp ; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1