diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 74e91c38fc9c..e6b7eb5bf531 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4986,3 +4986,47 @@ define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1
 }
 
 declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
+; CHECK-LABEL: bad_mask_transition:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vcmplt_oqpd %zmm3, %zmm2, %k0
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movzbl %cl, %ecx
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    kmovw %ecx, %k1
+; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
+; CHECK-NEXT:    vblendmps %zmm5, %zmm4, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4)
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i8 -1, i32 4)
+  %conv = zext i8 %0 to i16
+  %conv2 = zext i8 %1 to i16
+  %2 = bitcast i16 %conv to <16 x i1>
+  %3 = bitcast i16 %conv2 to <16 x i1>
+  %4 = shufflevector <16 x i1> %2, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %5 = shufflevector <16 x i1> %3, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = shufflevector <8 x i1> %4, <8 x i1> %5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %7 = select <16 x i1> %6, <16 x float> %f, <16 x float> %e
+  ret <16 x float> %7
+}
+
+define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
+; CHECK-LABEL: bad_mask_transition_2:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    vblendmps %zmm5, %zmm4, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i8 -1, i32 4)
+  %conv = zext i8 %0 to i16
+  %1 = bitcast i16 %conv to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %f, <16 x float> %e
+  ret <16 x float> %2
+}
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index cce2f43164d8..6e9b286810f2 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -2747,3 +2747,71 @@ bb.1:
 bb.2:
   ret void
 }
+
+; This is derived from an intrinsic test where a v4i1 mask was created by _mm_cmp_epi32_mask, then passed to _mm512_mask_blend_epi32, which uses a v16i1 mask.
+; The widening happens in the scalar domain between the intrinsics. The middle end optimized it to this.
+define <8 x i64> @mask_widening(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; KNL-LABEL: mask_widening:
+; KNL:       ## %bb.0: ## %entry
+; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $12, %k0, %k0
+; KNL-NEXT:    kshiftrw $12, %k0, %k1
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT:    vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: mask_widening:
+; SKX:       ## %bb.0: ## %entry
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vpmovm2d %k0, %zmm0
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; SKX-NEXT:    vpmovd2m %zmm0, %k1
+; SKX-NEXT:    vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_widening:
+; AVX512BW:       ## %bb.0: ## %entry
+; AVX512BW-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask_widening:
+; AVX512DQ:       ## %bb.0: ## %entry
+; AVX512DQ-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512DQ-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlb $4, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $4, %k0, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
+; AVX512DQ-NEXT:    vpblendmd %zmm5, %zmm4, %zmm0 {%k1}
+; AVX512DQ-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %a to <4 x i32>
+  %1 = bitcast <2 x i64> %b to <4 x i32>
+  %2 = icmp eq <4 x i32> %0, %1
+  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <8 x i64> %f to <16 x i32>
+  %5 = bitcast <8 x i64> %e to <16 x i32>
+  %6 = shufflevector <8 x i1> %3, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %7 = select <16 x i1> %6, <16 x i32> %4, <16 x i32> %5
+  %8 = bitcast <16 x i32> %7 to <8 x i64>
+  ret <8 x i64> %8
+}
+
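
For reference, the in-file comment above @mask_widening describes the intrinsic
code the IR was reduced from. A minimal C sketch of that pattern, assuming
AVX512F+AVX512VL and hypothetical parameter names (the real source is not part
of this patch, and the unused %c/%d arguments are dropped here):

    #include <immintrin.h>

    __m512i mask_widening(__m128i a, __m128i b, __m512i e, __m512i f) {
      /* The 128-bit compare produces a 4-bit result (v4i1) carried in a
         scalar __mmask8. */
      __mmask8 m = _mm_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
      /* The 512-bit blend consumes it as a 16-bit mask, so the v4i1 -> v16i1
         widening happens in the scalar (k-register/GPR) domain; set bits pick
         elements of f, clear bits pick elements of e. */
      return _mm512_mask_blend_epi32((__mmask16)m, e, f);
    }

The bad_mask_transition tests in avx512-intrinsics.ll exercise the analogous
scalar-domain transition for <8 x double> compares: their CHECK lines show each
8-bit mask bouncing through a GPR (kmovw/movzbl/kmovw) before kunpckbw rebuilds
the 16-bit mask for the blend.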