forked from OSchip/llvm-project
[AVX512] Add intrinsics for masked aligned FP loads and stores
These mirror the existing intrinsics for the unaligned cases. The test was generated with update_llc_test_checks.py. Part of <rdar://problem/17688758>. llvm-svn: 226296
This commit is contained in:
parent
9b8cfa212c
commit
3e8b22bc1b
|
@ -1363,6 +1363,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
// Masked load of 8 doubles (the "loadu" variant), tied to the GCC-style
// builtin __builtin_ia32_loadupd512_mask.
// Result: v8f64. Operands: (pointer, v8f64 vector, i8 mask).
// NOTE(review): the v8f64 operand is presumably the pass-through value for
// masked-off lanes -- confirm against the builtin's definition.
def int_x86_avx512_mask_loadu_pd_512 : GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadArgMem]>;
|
||||
// Masked aligned load of 16 single-precision floats, tied to the GCC-style
// builtin __builtin_ia32_loadaps512_mask.
// Result: v16f32. Operands: (pointer, v16f32 vector, i16 mask).
// Declared with the IntrReadArgMem property, like the loadu variants.
// NOTE(review): the v16f32 operand is presumably the pass-through value for
// masked-off lanes -- confirm against the builtin's definition.
def int_x86_avx512_mask_load_ps_512 : GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrReadArgMem]>;
|
||||
// Masked aligned load of 8 doubles, tied to the GCC-style builtin
// __builtin_ia32_loadapd512_mask.
// Result: v8f64. Operands: (pointer, v8f64 vector, i8 mask).
// Declared with the IntrReadArgMem property, like the loadu variants.
// NOTE(review): the v8f64 operand is presumably the pass-through value for
// masked-off lanes -- confirm against the builtin's definition.
def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadArgMem]>;
|
||||
}
|
||||
|
||||
// Conditional store ops
|
||||
|
@ -1389,6 +1395,14 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
GCCBuiltin<"__builtin_ia32_storeupd512_mask">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadWriteArgMem]>;
|
||||
// Masked aligned store of 16 single-precision floats, tied to the GCC-style
// builtin __builtin_ia32_storeaps512_mask.
// No result. Operands: (pointer, v16f32 data, i16 mask).
// Declared IntrReadWriteArgMem, matching the storeu definitions in this group.
def int_x86_avx512_mask_store_ps_512 :
|
||||
GCCBuiltin<"__builtin_ia32_storeaps512_mask">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrReadWriteArgMem]>;
|
||||
// Masked aligned store of 8 doubles, tied to the GCC-style builtin
// __builtin_ia32_storeapd512_mask.
// No result. Operands: (pointer, v8f64 data, i8 mask).
// Declared IntrReadWriteArgMem, matching the storeu definitions in this group.
def int_x86_avx512_mask_store_pd_512 :
|
||||
GCCBuiltin<"__builtin_ia32_storeapd512_mask">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadWriteArgMem]>;
|
||||
def int_x86_avx512_mask_store_ss :
|
||||
GCCBuiltin<"__builtin_ia32_storess_mask">,
|
||||
Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
|
||||
|
|
|
@ -2200,6 +2200,22 @@ def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
|
|||
(bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
|
||||
(VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
|
||||
|
||||
// Select VMOVAPDZrmkz for the aligned masked pd load when the vector operand
// is the all-zeros vector (immAllZerosV bitcast to v8f64) and the mask is a
// GR8 register. The GR8 mask is first copied into the VK8WM register class
// and used as a v8i1 mask operand.
def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
|
||||
(VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
|
||||
|
||||
// Select VMOVAPSZrmkz for the aligned masked ps load when the vector operand
// is the all-zeros vector (immAllZerosV bitcast to v16f32) and the mask is a
// GR16 register. The GR16 mask is first copied into the VK16WM register class
// and used as a v16i1 mask operand.
def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
|
||||
(VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
|
||||
|
||||
// Special case: when the mask operand is the constant (i8 -1), i.e. every
// mask bit set, and the vector operand is all-zeros, select the plain
// (unmasked) VMOVAPDZrm load instead of a masked form.
def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
|
||||
(VMOVAPDZrm addr:$ptr)>;
|
||||
|
||||
// Special case: when the mask operand is the constant (i16 -1), i.e. every
// mask bit set, and the vector operand is all-zeros, select the plain
// (unmasked) VMOVAPSZrm load instead of a masked form.
def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
|
||||
(VMOVAPSZrm addr:$ptr)>;
|
||||
|
||||
def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
|
||||
GR16:$mask),
|
||||
(VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
|
||||
|
@ -2209,6 +2225,15 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
|
|||
(VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
|
||||
VR512:$src)>;
|
||||
|
||||
// Select VMOVAPSZmrk for the aligned masked ps store. The v16f32 source comes
// from a VR512 register; the GR16 mask is copied into the VK16WM register
// class and used as a v16i1 mask operand.
def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src),
|
||||
GR16:$mask),
|
||||
(VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
|
||||
VR512:$src)>;
|
||||
// Select VMOVAPDZmrk for the aligned masked pd store. The v8f64 source comes
// from a VR512 register; the GR8 mask is copied into the VK8WM register
// class and used as a v8i1 mask operand.
def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
|
||||
GR8:$mask),
|
||||
(VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
|
||||
VR512:$src)>;
|
||||
|
||||
def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
|
||||
(VMOVUPSZmrk addr:$ptr,
|
||||
(v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
|
||||
|
|
|
@ -551,7 +551,73 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
|
|||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
|
||||
declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
; Calls llvm.x86.avx512.mask.store.ps.512 with a variable i16 mask.
; Expected output: the mask is moved into %k1 with kmovw, then a single
; vmovaps stores %zmm0 to (%rdi) under {%k1}.
define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_mask_store_aligned_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
|
||||
|
||||
; Calls llvm.x86.avx512.mask.store.pd.512 with a variable i8 mask.
; Expected output: the mask is moved into %k1 with kmovw, then a single
; vmovapd stores %zmm0 to (%rdi) under {%k1}.
define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_store_aligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
; Calls llvm.x86.avx512.mask.load.ps.512 with a zeroinitializer vector operand
; and a variable i16 mask. Expected output: kmovw of the mask into %k1, then
; vmovaps from (%rdi) into %zmm0 with the {%k1} {z} decorations.
; The %data argument is not used by the body.
define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_maskz_load_aligned_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
|
||||
|
||||
; Calls llvm.x86.avx512.mask.load.pd.512 with a zeroinitializer vector operand
; and a variable i8 mask. Expected output: kmovw of the mask into %k1, then
; vmovapd from (%rdi) into %zmm0 with the {%k1} {z} decorations.
; The %data argument is not used by the body.
define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_load_aligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
; Calls llvm.x86.avx512.mask.load.ps.512 with a zeroinitializer vector operand
; and the constant mask i16 -1. Expected output: a plain vmovaps from (%rdi)
; into %zmm0 with no mask register set up.
; The %data and %mask arguments are not used by the body.
define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_load_aligned_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
; Calls llvm.x86.avx512.mask.load.pd.512 with a zeroinitializer vector operand
; and the constant mask i8 -1. Expected output: a plain vmovapd from (%rdi)
; into %zmm0 with no mask register set up.
; The %data and %mask arguments are not used by the body.
define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_load_aligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
|
||||
define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
|
||||
; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
|
||||
|
|
Loading…
Reference in New Issue