forked from OSchip/llvm-project
[AVX-512] Add scalar masked max/min intrinsic instructions to the load folding tables.
llvm-svn: 294153
This commit is contained in:
parent
cb4bc8be5b
commit
8eb1f315ac
|
@ -2339,10 +2339,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
|
|||
{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
|
||||
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
|
||||
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
|
||||
{ X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
|
||||
{ X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
|
||||
{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
|
||||
{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
|
||||
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
|
||||
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
|
||||
{ X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
|
||||
{ X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
|
||||
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
|
||||
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
|
||||
{ X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
|
||||
|
@ -2674,10 +2678,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
|
|||
{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
|
||||
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
|
||||
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
|
||||
{ X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
|
||||
{ X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
|
||||
{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
|
||||
{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
|
||||
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
|
||||
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
|
||||
{ X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
|
||||
{ X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
|
||||
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
|
||||
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
|
||||
{ X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
|
||||
|
@ -7744,6 +7752,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
|
|||
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
|
||||
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
|
||||
case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
|
||||
case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
|
||||
case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
|
||||
case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
|
||||
case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
|
||||
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
|
||||
|
@ -7793,6 +7803,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
|
|||
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
|
||||
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
|
||||
case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
|
||||
case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
|
||||
case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
|
||||
case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
|
||||
case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
|
||||
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
|
||||
|
|
|
@ -2510,6 +2510,39 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
|
|||
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
; Merge-masked scalar single-precision max with a memory source operand.
; The second vector operand is built from a float loaded through %a1 placed in
; lane 0, with lanes 1-3 set to 0.0; %a2 is passed as the third intrinsic
; operand and %mask as the mask. The FileCheck directives below expect the load
; to appear as the memory operand of the masked vmaxss itself, with the result
; written under {%k1} and then copied into %xmm0.
define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_max_ss_memfold:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: andl $1, %esi
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1}
|
||||
; CHECK-NEXT: vmovaps %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
; Build <loaded float, 0.0, 0.0, 0.0> as the second source of the intrinsic.
%a1.val = load float, float* %a1
|
||||
%a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
|
||||
%a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
|
||||
%a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
|
||||
%a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
|
||||
; Zero-masked scalar single-precision max with a memory source operand.
; Same construction as the merge-masked variant (float loaded through %a1 in
; lane 0, lanes 1-3 set to 0.0), but the third intrinsic operand is
; zeroinitializer. The FileCheck directives below expect the load to appear as
; the memory operand of vmaxss with the {%k1} {z} form.
define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_max_ss_memfold:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: andl $1, %esi
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
; Build <loaded float, 0.0, 0.0, 0.0> as the second source of the intrinsic.
%a1.val = load float, float* %a1
|
||||
%a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
|
||||
%a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
|
||||
%a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
|
||||
%a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
|
||||
|
||||
define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
|
||||
|
@ -2576,6 +2609,35 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
|
|||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
; Merge-masked scalar double-precision max with a memory source operand.
; The second vector operand is built from a double loaded through %a1 placed in
; lane 0, with lane 1 set to 0.0; %a2 is passed as the third intrinsic operand
; and %mask as the mask. The FileCheck directives below expect the load to
; appear as the memory operand of the masked vmaxsd itself, with the result
; written under {%k1} and then copied into %xmm0.
define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_max_sd_memfold:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: andl $1, %esi
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
|
||||
; CHECK-NEXT: vmovapd %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
; Build <loaded double, 0.0> as the second source of the intrinsic.
%a1.val = load double, double* %a1
|
||||
%a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
|
||||
%a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
|
||||
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
; Zero-masked scalar double-precision max with a memory source operand.
; Same construction as the merge-masked variant (double loaded through %a1 in
; lane 0, lane 1 set to 0.0), but the third intrinsic operand is
; zeroinitializer. The FileCheck directives below expect the load to appear as
; the memory operand of vmaxsd with the {%k1} {z} form.
define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_max_sd_memfold:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: andl $1, %esi
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
; Build <loaded double, 0.0> as the second source of the intrinsic.
%a1.val = load double, double* %a1
|
||||
%a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
|
||||
%a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
|
||||
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
|
||||
ret <2 x double> %res
|
||||
}
|
||||
|
||||
define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
|
||||
; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
|
||||
; CHECK: ## BB#0:
|
||||
|
|
Loading…
Reference in New Issue