From afce0baacd6a10e891518d3f584600de366a5e0b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 30 Aug 2017 16:38:33 +0000
Subject: [PATCH] [AVX512] Don't use the 32-bit element versions of
 AND/OR/XOR/ANDN during isel unless we're matching a masked op or broadcast

Selecting 32-bit element logical ops without a select or broadcast requires
matching a bitconvert on the inputs to the and. But that's a weird thing to
rely on: it's entirely possible that one input has a bitcast and the other
doesn't. Since there's no functional difference, just remove the extra
patterns and save some isel table size.

Differential Revision: https://reviews.llvm.org/D36854

llvm-svn: 312138
---
 llvm/lib/Target/X86/X86InstrAVX512.td        | 66 ++++++++++---------
 llvm/test/CodeGen/X86/avx512-arith.ll        |  6 +-
 .../CodeGen/X86/avx512-intrinsics-upgrade.ll |  6 +-
 llvm/test/CodeGen/X86/avx512-logic.ll        | 10 +--
 .../X86/broadcast-elm-cross-splat-vec.ll     |  4 +-
 llvm/test/CodeGen/X86/vector-bitreverse.ll   |  6 +-
 llvm/test/CodeGen/X86/vector-lzcnt-512.ll    |  8 +--
 llvm/test/CodeGen/X86/vector-rotate-512.ll   |  2 +-
 llvm/test/CodeGen/X86/vector-trunc-math.ll   |  6 +-
 llvm/test/CodeGen/X86/vector-tzcnt-512.ll    | 16 ++---
 10 files changed, 66 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 34bce9bf5a06..bb5c12d70d00 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5094,41 +5094,51 @@ let Predicates = [HasDQI, NoVLX] in {
 // AVX-512  Logical Instructions
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           X86VectorVTInfo _, bit IsCommutable = 0> {
+// OpNodeMsk is the OpNode to use when element size is important. OpNode will
+// be set to null_frag for 32-bit elements.
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
+                           SDPatternOperator OpNode,
+                           SDNode OpNodeMsk, X86VectorVTInfo _,
+                           bit IsCommutable = 0> {
+  let hasSideEffects = 0 in
   defm rr : AVX512_maskable_logic, AVX512BIBase, EVEX_4V;
+  let hasSideEffects = 0, mayLoad = 1 in
  defm rm : AVX512_maskable_logic, AVX512BIBase, EVEX_4V;
 }
-multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                            X86VectorVTInfo _, bit IsCommutable = 0> :
-           avx512_logic_rm {
+// OpNodeMsk is the OpNode to use where element size is important. So use
+// for all of the broadcast patterns.
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
+                            SDPatternOperator OpNode,
+                            SDNode OpNodeMsk, X86VectorVTInfo _,
+                            bit IsCommutable = 0> :
+           avx512_logic_rm {
   defm rmb : AVX512_maskable_logic opc, string OpcodeStr, SDNode OpNode,
              AVX512BIBase, EVEX_4V, EVEX_B;
 }
-multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
-                               AVX512VLVectorVTInfo VTInfo,
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator OpNode,
+                               SDNode OpNodeMsk, AVX512VLVectorVTInfo VTInfo,
                                bit IsCommutable = 0> {
   let Predicates = [HasAVX512] in
-    defm Z : avx512_logic_rmb, EVEX_V512;
   let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_logic_rmb, EVEX_V256;
-    defm Z128 : avx512_logic_rmb, EVEX_V128;
+    defm Z256 : avx512_logic_rmb, EVEX_V256;
+    defm Z128 : avx512_logic_rmb, EVEX_V128;
   }
 }
-multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                bit IsCommutable = 0> {
-  defm NAME : avx512_logic_rmb_vl, EVEX_CD8<32, CD8VF>;
-}
-
-multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                bit IsCommutable = 0> {
-  defm NAME : avx512_logic_rmb_vl,
-              VEX_W, EVEX_CD8<64, CD8VF>;
-}
-
 multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
                                  SDNode OpNode, bit IsCommutable = 0> {
-  defm Q : avx512_logic_rm_vl_q;
-  defm D : avx512_logic_rm_vl_d;
+  defm Q : avx512_logic_rmb_vl,
+           VEX_W, EVEX_CD8<64, CD8VF>;
+  defm D : avx512_logic_rmb_vl,
+           EVEX_CD8<32, CD8VF>;
 }
 defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index debfb2974fac..1bcd3c60e3b2 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -607,17 +607,17 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; AVX512F-LABEL: andd512fold:
 ; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vpandq (%rdi), %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: andd512fold:
 ; AVX512VL: # BB#0: # %entry
-; AVX512VL-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vpandq (%rdi), %zmm0, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: andd512fold:
 ; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: andd512fold:
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index afb463d9fe47..0e4a88bd0b44 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -959,7 +959,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_xor_epi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -981,7 +981,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_or_epi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -1003,7 +1003,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x
 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_and_epi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index 6e08753dbbb1..c96c63dd0a45 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -7,7 +7,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpandd:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; ALL-NEXT: retq
 entry:
 ; Force the execution domain with an add.
@@ -21,7 +21,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; ALL-LABEL: vpandnd:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
 ; ALL-NEXT: retq
 entry:
 ; Force the execution domain with an add.
@@ -37,7 +37,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; ALL-LABEL: vpord:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; ALL-NEXT: retq
 entry:
 ; Force the execution domain with an add.
@@ -51,7 +51,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpxord:
 ; ALL: ## BB#0: ## %entry
 ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; ALL-NEXT: retq
 entry:
 ; Force the execution domain with an add.
@@ -132,7 +132,7 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; KNL-LABEL: andd512fold:
 ; KNL: ## BB#0: ## %entry
-; KNL-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; KNL-NEXT: vpandq (%rdi), %zmm0, %zmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: andd512fold:
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 14bdb3853b03..1194f96b01ab 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1335,7 +1335,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retl
 ;
 ; AVX-64-LABEL: f16xi32_i128:
@@ -1369,7 +1369,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT: retq
 %res1 = add <16 x i32> , %a
 %res2 = and <16 x i32> , %res1
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index af91d35a6ee0..485911280c69 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -2051,17 +2051,17 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1
 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1
 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1
 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: test_bitreverse_v16i32:
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index 2a200d093e18..a7ef5980a23a 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -176,7 +176,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -206,7 +206,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32:
 ; AVX512DQ: # BB#0:
 ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1
 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -263,7 +263,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32u:
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -293,7 +293,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32u:
 ; AVX512DQ: # BB#0:
 ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1
 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 9403ea12ff28..bf02f94b1612 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -696,7 +696,7 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vprold $4, %zmm0, %zmm0
-; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: retq
 %shl = shl <16 x i32> %a,
 %lshr = lshr <16 x i32> %a,
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index f22bc95aa935..bbeb9055d05c 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -3116,7 +3116,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -3830,7 +3830,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
@@ -4544,7 +4544,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ;
 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
index a604e41c031e..4d3858863e33 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -139,7 +139,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CD: # BB#0:
 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
@@ -175,7 +175,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CDBW: # BB#0:
 ; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
 ; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -197,7 +197,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -219,7 +219,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512VPOPCNTDQ: # BB#0:
 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
@@ -233,7 +233,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512CD: # BB#0:
 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
 ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0
@@ -243,7 +243,7 @@
 ; AVX512CDBW: # BB#0:
 ; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
 ; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
 ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0
@@ -253,7 +253,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW: # BB#0:
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -275,7 +275,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512VPOPCNTDQ: # BB#0:
 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
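
As a rough standalone illustration of the behavior the tests above check (a sketch, not part of this patch; the RUN line, function names, and CHECK patterns below are assumed for illustration): with AVX512F, an unmasked 512-bit integer AND is now free to select the 64-bit element form, while a masked AND still needs the 32-bit element form because the mask applies per 32-bit lane.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s

; Unmasked, non-broadcast logic op: element size does not matter, so the
; 64-bit element instruction is acceptable after this change.
define <16 x i32> @and_v16i32_unmasked(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: and_v16i32_unmasked:
; CHECK: vpandq
  %r = and <16 x i32> %a, %b
  ret <16 x i32> %r
}

; Masked logic op: the mask is per 32-bit element, so the 32-bit element
; instruction is still selected.
define <16 x i32> @and_v16i32_masked(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: and_v16i32_masked:
; CHECK: vpandd
  %r = and <16 x i32> %a, %b
  %m = bitcast i16 %mask to <16 x i1>
  %sel = select <16 x i1> %m, <16 x i32> %r, <16 x i32> %passthru
  ret <16 x i32> %sel
}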