llvm-project/llvm/test/CodeGen/X86/masked_memop.ll

1335 lines
50 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
; To test for the case where masked load/store is not legal, we should add a run with a target
; that does not have AVX, but that case should probably be a separate test file using less tests
; because it takes over 1.2 seconds to codegen these tests on Haswell 4GHz if there's no maskmov.
define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
; AVX-LABEL: loadv1:
; AVX: ## %bb.0:
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: ## implicit-def: $xmm1
; AVX-NEXT: je LBB0_1
; AVX-NEXT: ## %bb.2: ## %else
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: jne LBB0_3
; AVX-NEXT: LBB0_4: ## %else
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
; AVX-NEXT: LBB0_1: ## %cond.load
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: testq %rdi, %rdi
; AVX-NEXT: je LBB0_4
; AVX-NEXT: LBB0_3: ## %else
; AVX-NEXT: vmovaps %xmm0, %xmm1
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: loadv1:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: testq %rdi, %rdi
; AVX512F-NEXT: ## implicit-def: $xmm1
; AVX512F-NEXT: jne LBB0_2
; AVX512F-NEXT: ## %bb.1: ## %cond.load
; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: LBB0_2: ## %else
; AVX512F-NEXT: testq %rdi, %rdi
; AVX512F-NEXT: sete %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: loadv1:
; SKX: ## %bb.0:
; SKX-NEXT: testq %rdi, %rdi
; SKX-NEXT: ## implicit-def: $xmm1
; SKX-NEXT: jne LBB0_2
; SKX-NEXT: ## %bb.1: ## %cond.load
; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; SKX-NEXT: LBB0_2: ## %else
; SKX-NEXT: testq %rdi, %rdi
; SKX-NEXT: sete %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <1 x i64> %trigger, zeroinitializer
%res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
ret <1 x double> %res
}
declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
; AVX-LABEL: storev1:
; AVX: ## %bb.0:
; AVX-NEXT: testl %edi, %edi
; AVX-NEXT: je LBB1_1
; AVX-NEXT: ## %bb.2: ## %else
; AVX-NEXT: retq
; AVX-NEXT: LBB1_1: ## %cond.store
; AVX-NEXT: movl %edx, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: storev1:
; AVX512: ## %bb.0:
; AVX512-NEXT: testl %edi, %edi
; AVX512-NEXT: je LBB1_1
; AVX512-NEXT: ## %bb.2: ## %else
; AVX512-NEXT: retq
; AVX512-NEXT: LBB1_1: ## %cond.store
; AVX512-NEXT: movl %edx, (%rsi)
; AVX512-NEXT: retq
%mask = icmp eq <1 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
ret void
}
declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: test6:
; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: test6:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test6:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
ret <2 x double> %res
}
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: test7:
; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: test7:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test7:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
ret <4 x float> %res
}
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: test8:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test8:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test8:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
ret <4 x i32> %res
}
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test9:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test9:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
ret void
}
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
ret <4 x double> %res
}
define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10b:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10b:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10b:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10b:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %xmm0, %xmm0, %k1
; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
ret <4 x double> %res
}
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; AVX1-LABEL: test11a:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11a:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11a:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11a:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
ret <8 x float> %res
}
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-LABEL: test11b:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11b:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11b:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
ret <8 x i32> %res
}
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX1-LABEL: test11c:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11c:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11c:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11c:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
ret <8 x float> %res
}
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX1-LABEL: test11d:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11d:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11d:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11d:
; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
ret <8 x i32> %res
}
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX1-LABEL: test12:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test12:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test12:
; SKX: ## %bb.0:
; SKX-NEXT: vptestnmd %ymm0, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
ret void
}
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX1-LABEL: test14:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test14:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test14:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
ret void
}
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX1-LABEL: test15:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test15:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test15:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
ret void
}
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; AVX1-LABEL: test16:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test16:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test16:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
ret <2 x float> %res
}
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; AVX1-LABEL: test17:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test17:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test17:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
ret <2 x i32> %res
}
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX1-LABEL: test18:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test18:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test18:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
ret <2 x float> %res
}
define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
; AVX-LABEL: load_all:
; AVX: ## %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: load_all:
; AVX512F: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: load_all:
; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
ret <4 x float> %res
}
;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
; 128-bit FP vectors are supported with AVX.
define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: mload_constmask_v4f32:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movw $13, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $13, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
ret <4 x float> %res
}
define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: mload_constmask_v2f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
;
; AVX512-LABEL: mload_constmask_v2f64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX512-NEXT: retq
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
ret <2 x double> %res
}
; 128-bit integer vectors are supported with AVX2.
define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: mload_constmask_v4i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movw $14, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $14, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
ret <4 x i32> %res
}
define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; AVX-LABEL: mload_constmask_v2i64:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: mload_constmask_v2i64:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX512-NEXT: retq
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
ret <2 x i64> %res
}
; 256-bit FP vectors are supported with AVX.
define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
; AVX-LABEL: mload_constmask_v8f32:
; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
ret <8 x float> %res
}
define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
; AVX-LABEL: mload_constmask_v4f64:
; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
ret <4 x double> %res
}
; 256-bit integer vectors are supported with AVX2.
define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; AVX-LABEL: mload_constmask_v8i32:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8i32:
; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
ret <8 x i32> %res
}
define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX-LABEL: mload_constmask_v4i64:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movb $9, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
ret <4 x i64> %res
}
; 512-bit FP vectors are supported with AVX512.
define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; AVX-LABEL: mload_constmask_v8f64:
; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $-121, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f64:
; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; SKX-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
ret <8 x double> %res
}
; If the pass-through operand is undef, no blend is needed.
define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
; AVX512F: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
ret <4 x double> %res
}
define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX1: ## %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
; AVX512F: ## %bb.0:
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movb $6, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
; SKX: ## %bb.0:
; SKX-NEXT: movb $6, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
ret <4 x i64> %res
}
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test21:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test21:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test21:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: movw $15, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test21:
; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
ret void
}
; When only one element of the mask is set, reduce to a scalar store.
define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vmovss %xmm0, (%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
ret void
}
; Choose a different element to show that the correct address offset is produced.
define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX-LABEL: one_mask_bit_set3:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set3:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set4:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
}
; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: one_mask_bit_set5:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
}
; When only one element of the mask is set, reduce to a scalar load.
define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: load_one_mask_bit_set1:
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set1:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX512-NEXT: retq
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
ret <4 x i32> %res
}
; Choose a different element to show that the correct address offset is produced.
define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: load_one_mask_bit_set2:
; AVX: ## %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set2:
; AVX512: ## %bb.0:
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX512-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
ret <4 x float> %res
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX1-LABEL: load_one_mask_bit_set3:
; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_one_mask_bit_set3:
; AVX2: ## %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set3:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
}
; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set4:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
ret <4 x double> %res
}
; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set5:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
ret <8 x double> %res
}
; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
; AVX-LABEL: trunc_mask:
; AVX: ## %bb.0:
; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_mask:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-08 02:20:37 +08:00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $12, %k0, %k0
; AVX512F-NEXT: kshiftrw $12, %k0, %k1
; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: trunc_mask:
; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
%bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
ret void
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)