diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_load.ll similarity index 68% rename from llvm/test/CodeGen/X86/masked_memop.ll rename to llvm/test/CodeGen/X86/masked_load.ll index c0359b35993b..334adca8079f 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX -; To test for the case where masked load/store is not legal, we should add a run with a target +; To test for the case where masked load is not legal, we should add a run with a target ; that does not have AVX, but that case should probably be a separate test file using less tests ; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov. @@ -32,30 +32,6 @@ define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, < } declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>) -define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) { -; AVX-LABEL: store_v1i32_v1i32: -; AVX: ## %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jne LBB1_2 -; AVX-NEXT: ## %bb.1: ## %cond.store -; AVX-NEXT: movl %edx, (%rsi) -; AVX-NEXT: LBB1_2: ## %else -; AVX-NEXT: retq -; -; AVX512-LABEL: store_v1i32_v1i32: -; AVX512: ## %bb.0: -; AVX512-NEXT: testl %edi, %edi -; AVX512-NEXT: jne LBB1_2 -; AVX512-NEXT: ## %bb.1: ## %cond.store -; AVX512-NEXT: movl %edx, (%rsi) -; AVX512-NEXT: LBB1_2: ## %else -; AVX512-NEXT: retq - %mask = icmp eq <1 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask) - ret void -} -declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>) - 
define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { ; AVX-LABEL: load_v2f64_v2i64: ; AVX: ## %bb.0: @@ -157,42 +133,6 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i3 ret <4 x i32> %res } -define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { -; AVX1-LABEL: store_v4i32_v4i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_v4i32_v4i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_v4i32_v4i32: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: store_v4i32_v4i32: -; SKX: ## %bb.0: -; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} -; SKX-NEXT: retq - %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) - ret void -} - define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) { ; AVX1-LABEL: load_v4f64_v4i32: ; AVX1: ## %bb.0: @@ -440,136 +380,6 @@ define <8 x i32> @load_zero_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr) { ret <8 x i32> %res } -define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { -; AVX1-LABEL: store_v8i32_v8i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: 
vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_v8i32_v8i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_v8i32_v8i32: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: store_v8i32_v8i32: -; SKX: ## %bb.0: -; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %mask = icmp eq <8 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) - ret void -} - -define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { -; AVX1-LABEL: store_v2f32_v2i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_v2f32_v2i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_v2f32_v2i32: 
-; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: store_v2f32_v2i32: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} -; SKX-NEXT: retq - %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) - ret void -} - -define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { -; AVX1-LABEL: store_v2i32_v2i32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_v2i32_v2i32: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_v2i32_v2i32: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm1[0,2,2,3] -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: store_v2i32_v2i32: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} -; SKX-NEXT: retq - %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) - ret void -} - define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { ; AVX1-LABEL: load_v2f32_v2i32: ; AVX1: ## %bb.0: @@ -1023,130 +833,6 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) { ret <4 x i64> %res } -define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { -; AVX1-LABEL: test21: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: test21: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: test21: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: movw $15, %ax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: test21: -; SKX: ## %bb.0: -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} -; SKX-NEXT: retq - %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>) - ret void -} - -; When only one element of the mask is set, reduce to a scalar store.
- -define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) { -; AVX-LABEL: one_mask_bit_set1: -; AVX: ## %bb.0: -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: one_mask_bit_set1: -; AVX512: ## %bb.0: -; AVX512-NEXT: vmovss %xmm0, (%rdi) -; AVX512-NEXT: retq - call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>) - ret void -} - -; Choose a different element to show that the correct address offset is produced. - -define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) { -; AVX-LABEL: one_mask_bit_set2: -; AVX: ## %bb.0: -; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: one_mask_bit_set2: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX512-NEXT: retq - call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) - ret void -} - -; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. - -define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { -; AVX-LABEL: one_mask_bit_set3: -; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovlps %xmm0, 16(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX512-LABEL: one_mask_bit_set3: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovlps %xmm0, 16(%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) - ret void -} - -; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
- -define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { -; AVX-LABEL: one_mask_bit_set4: -; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovhpd %xmm0, 24(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX512-LABEL: one_mask_bit_set4: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>) - ret void -} - -; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. - -define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { -; AVX-LABEL: one_mask_bit_set5: -; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vmovlps %xmm0, 48(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX512-LABEL: one_mask_bit_set5: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>) - ret void -} - ; When only one element of the mask is set, reduce to a scalar load. define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) { @@ -1246,160 +932,25 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v ret <8 x double> %res } -; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed. -; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
- -define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) { -; AVX-LABEL: trunc_mask: -; AVX: ## %bb.0: -; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_mask: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: trunc_mask: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1 -; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1} -; SKX-NEXT: retq - %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer - call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask) - ret void -} - -; SimplifyDemandedBits eliminates an ashr here. 
- -define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) { -; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vmovupd %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: masked_store_bool_mask_demand_trunc_sext: -; SKX: ## %bb.0: -; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 -; SKX-NEXT: vmovupd %ymm0, (%rdi) {%k1} -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %sext = sext <4 x i32> %masksrc to <4 x i64> - %boolmask = trunc <4 x i64> %sext to <4 x i1> - call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask) - ret void -} - -; This needs to be widened to v4i32. -; This used to assert in type legalization. PR38436 -; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask. 
-define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { -; AVX1-LABEL: widen_masked_store: -; AVX1: ## %bb.0: -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: vmovd %esi, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: widen_masked_store: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: vmovd %esi, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: widen_masked_store: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; SKX-LABEL: widen_masked_store: -; SKX: ## %bb.0: -; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 -; SKX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z} -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 -; SKX-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} -; SKX-NEXT: retq - call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) - ret void -} -declare void 
@llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>) - define i32 @pr38986(i1 %c, i32* %p) { ; AVX-LABEL: pr38986: ; AVX: ## %bb.0: ; AVX-NEXT: testb $1, %dil ; AVX-NEXT: ## implicit-def: $eax -; AVX-NEXT: je LBB44_2 +; AVX-NEXT: je LBB30_2 ; AVX-NEXT: ## %bb.1: ## %cond.load ; AVX-NEXT: movl (%rsi), %eax -; AVX-NEXT: LBB44_2: ## %else +; AVX-NEXT: LBB30_2: ## %else ; AVX-NEXT: retq ; ; AVX512-LABEL: pr38986: ; AVX512: ## %bb.0: ; AVX512-NEXT: testb $1, %dil ; AVX512-NEXT: ## implicit-def: $eax -; AVX512-NEXT: je LBB44_2 +; AVX512-NEXT: je LBB30_2 ; AVX512-NEXT: ## %bb.1: ## %cond.load ; AVX512-NEXT: movl (%rsi), %eax -; AVX512-NEXT: LBB44_2: ## %else +; AVX512-NEXT: LBB30_2: ## %else ; AVX512-NEXT: retq %vc = insertelement <1 x i1> undef, i1 %c, i32 0 %vp = bitcast i32* %p to <1 x i32>* @@ -1414,12 +965,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>) -declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) -declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) @@ -1427,8 +972,3 @@ declare 
<2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) -declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) - diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll new file mode 100644 index 000000000000..bdc5e09b45e6 --- /dev/null +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -0,0 +1,470 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX + +; To test for the case where masked store is not legal, we should add a run with a target +; that does not have AVX, but that case should probably be a separate test file using fewer tests +; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
+ +define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) { +; AVX-LABEL: store_v1i32_v1i32: +; AVX: ## %bb.0: +; AVX-NEXT: testl %edi, %edi +; AVX-NEXT: jne LBB0_2 +; AVX-NEXT: ## %bb.1: ## %cond.store +; AVX-NEXT: movl %edx, (%rsi) +; AVX-NEXT: LBB0_2: ## %else +; AVX-NEXT: retq +; +; AVX512-LABEL: store_v1i32_v1i32: +; AVX512: ## %bb.0: +; AVX512-NEXT: testl %edi, %edi +; AVX512-NEXT: jne LBB0_2 +; AVX512-NEXT: ## %bb.1: ## %cond.store +; AVX512-NEXT: movl %edx, (%rsi) +; AVX512-NEXT: LBB0_2: ## %else +; AVX512-NEXT: retq + %mask = icmp eq <1 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask) + ret void +} +declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>) + +define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { +; AVX1-LABEL: store_v4i32_v4i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_v4i32_v4i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: store_v4i32_v4i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: store_v4i32_v4i32: +; SKX: ## %bb.0: +; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, 
i32 4, <4 x i1>%mask) + ret void +} + +define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { +; AVX1-LABEL: store_v8i32_v8i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_v8i32_v8i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: store_v8i32_v8i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: store_v8i32_v8i32: +; SKX: ## %bb.0: +; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) + ret void +} + +define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { +; AVX1-LABEL: store_v2f32_v2i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_v2f32_v2i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; 
AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: store_v2f32_v2i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: store_v2f32_v2i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { +; AVX1-LABEL: store_v2i32_v2i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_v2i32_v2i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX2-NEXT: retq 
+; +; AVX512F-LABEL: store_v2i32_v2i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: store_v2i32_v2i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @const_store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { +; AVX1-LABEL: const_store_v4i32_v4i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: const_store_v4i32_v4i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: const_store_v4i32_v4i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: movw $15, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; SKX-LABEL: const_store_v4i32_v4i32: +; SKX: ## %bb.0: +; SKX-NEXT: kxnorw %k0, %k0, %k1 +; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>) + ret void +} + +; When only one element of the mask is set, reduce to a scalar store.
+ +define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) { +; AVX-LABEL: one_mask_bit_set1: +; AVX: ## %bb.0: +; AVX-NEXT: vmovss %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set1: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovss %xmm0, (%rdi) +; AVX512-NEXT: retq + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>) + ret void +} + +; Choose a different element to show that the correct address offset is produced. + +define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) { +; AVX-LABEL: one_mask_bit_set2: +; AVX: ## %bb.0: +; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set2: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX512-NEXT: retq + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) + ret void +} + +; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly. + +define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { +; AVX-LABEL: one_mask_bit_set3: +; AVX: ## %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovlps %xmm0, 16(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set3: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, 16(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) + ret void +} + +; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+ +define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { +; AVX-LABEL: one_mask_bit_set4: +; AVX: ## %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set4: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>) + ret void +} + +; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected. + +define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { +; AVX-LABEL: one_mask_bit_set5: +; AVX: ## %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set5: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>) + ret void +} + +; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed. +; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
+
+define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
+; AVX-LABEL: trunc_mask:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mask:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: trunc_mask:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
+; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1}
+; SKX-NEXT: retq
+  %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer ; a lane is enabled iff its i32 mask value is negative, i.e. its sign bit is set
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask) ; stores %x with alignment 1; %y is never referenced (presumably only present so %mask arrives in xmm2, as the checks above expect - confirm)
+  ret void
+}
+
+; SimplifyDemandedBits eliminates an ashr here.
+
+define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x double>* %p, <4 x i32> %masksrc) {
+; AVX1-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovupd %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vmovupd %ymm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+  %sext = sext <4 x i32> %masksrc to <4 x i64> ; widen each i32 mask source lane to i64
+  %boolmask = trunc <4 x i64> %sext to <4 x i1> ; keep only bit 0 of each lane, so the mask depends solely on the low bit of %masksrc
+  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask) ; store the lanes of %x whose %boolmask bit is set
+  ret void
+}
+
+; This needs to be widened to v4i32.
+; This used to assert in type legalization. PR38436
+; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
+define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
+; AVX1-LABEL: widen_masked_store:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vmovd %esi, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: widen_masked_store:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: vmovd %esi, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: widen_masked_store:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: widen_masked_store:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vmovdqa32 %xmm1, %xmm1 {%k1} {z}
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
+; SKX-NEXT: retq
+  call void @llvm.masked.store.v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) ; 3-element data and mask with alignment 16; the checks above blend a zero into lane 3 of the mask before the store
+  ret void
+}
+declare void @llvm.masked.store.v3i32(<3 x i32>, <3 x i32>*, i32, <3 x i1>) ; NOTE(review) - name lacks the pointer-type suffix used by the declarations below; presumably relies on LLVM remangling the intrinsic name - confirm
+
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+