[AVX-512] Add scalar masked add/sub/mul/div intrinsic instructions to the load folding tables.

llvm-svn: 294152
Craig Topper 2017-02-05 22:25:42 +00:00
parent 59af67206d
commit cb4bc8be5b
2 changed files with 86 additions and 0 deletions
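For readers unfamiliar with the load folding tables this commit extends, here is a minimal C++ sketch of the idea; the types, opcode values, and lookup helper below are simplified stand-ins, not the exact definitions in X86InstrInfo.cpp. Each table entry pairs a register-form opcode with its memory-form equivalent plus flags; TB_NO_REVERSE marks entries that must never be unfolded back into a separate load, because the scalar _Int memory forms read fewer bytes than a full XMM register.

// Minimal sketch (illustrative names, not the exact LLVM types): an entry in
// a load folding table maps a register-register opcode to the equivalent
// register-memory opcode, plus flags constraining the transformation.
#include <cstdint>
#include <cstdio>

enum FoldFlags : uint16_t {
  TB_NONE = 0,
  TB_NO_REVERSE = 1 << 0, // memory form must not be unfolded back to a load
};

struct MemoryFoldTableEntry {
  unsigned RegOp;  // register-form opcode
  unsigned MemOp;  // memory-form opcode
  uint16_t Flags;
};

// Hypothetical opcode values standing in for X86::VADDSSZrr_Intkz and friends.
enum : unsigned { VADDSSZrr_Intkz = 1, VADDSSZrm_Intkz = 2 };

static const MemoryFoldTableEntry FoldTable2[] = {
    {VADDSSZrr_Intkz, VADDSSZrm_Intkz, TB_NO_REVERSE},
};

// Consulted when the peephole tries to fold a load into one of its users.
static const MemoryFoldTableEntry *lookupFold(unsigned RegOp) {
  for (const MemoryFoldTableEntry &E : FoldTable2)
    if (E.RegOp == RegOp)
      return &E;
  return nullptr; // no memory form known; the load stays a separate instruction
}

int main() {
  if (const MemoryFoldTableEntry *E = lookupFold(VADDSSZrr_Intkz))
    std::printf("fold to opcode %u (unfoldable back: %s)\n", E->MemOp,
                (E->Flags & TB_NO_REVERSE) ? "yes" : "no");
}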


@@ -2315,6 +2315,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
{ X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
{ X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
{ X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
@@ -2323,6 +2325,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
{ X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
{ X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
{ X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
@@ -2341,6 +2345,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
{ X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
{ X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
{ X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
@@ -2389,6 +2395,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
{ X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
{ X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
{ X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
{ X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
@@ -2642,6 +2650,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
{ X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
{ X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
{ X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
{ X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
{ X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
@@ -2650,6 +2660,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
{ X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
{ X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
{ X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
{ X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
@@ -2668,6 +2680,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
{ X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
{ X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
{ X86::VORPDZrrk, X86::VORPDZrmk, 0 },
{ X86::VORPSZrrk, X86::VORPSZrmk, 0 },
{ X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
@@ -2729,6 +2743,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
{ X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
{ X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
{ X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
{ X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
@@ -7726,6 +7742,10 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
@@ -7771,6 +7791,10 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
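The new switch cases above extend the allow-list in isNonFoldablePartialRegisterLoad to cover the masked scalar forms. The intent, as a hedged sketch with illustrative names and sizes rather than the actual LLVM code: a MOVSS/MOVSD-style load only defines the low 32 or 64 bits of a 128-bit register, so folding it is only safe into a user that reads nothing beyond that low element; the masked _Intk/_Intkz scalar forms qualify, just like their unmasked counterparts.

#include <cstdint>

// What the user instruction reads from the loaded register (illustrative).
enum class UserKind {
  ScalarF32, // e.g. the VADDSSZrr_Intk / _Intkz family: only the low 32 bits
  ScalarF64, // e.g. the VADDSDZrr_Intk / _Intkz family: only the low 64 bits
  Packed     // reads the entire 128-bit register
};

// Returns true when folding the partial load must be rejected: the load
// defines fewer bits than the user would consume from that register.
bool isNonFoldablePartialLoad(unsigned LoadBits, unsigned DestRegBits,
                              UserKind User) {
  if (LoadBits >= DestRegBits)
    return false; // the whole register is defined; folding is always safe
  switch (User) {
  case UserKind::ScalarF32:
    return LoadBits < 32;
  case UserKind::ScalarF64:
    return LoadBits < 64;
  case UserKind::Packed:
    return true; // the upper elements would be undefined after folding
  }
  return true;
}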


@@ -2301,6 +2301,39 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %res
}
define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%a1.val = load float, float* %a1
%a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
%a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
%a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
%a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
ret <4 x float> %res
}
define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_current_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%a1.val = load float, float* %a1
%a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
%a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
%a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
%a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
ret <4 x float> %res
}
declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
@@ -2383,6 +2416,35 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
ret <2 x double> %res
}
define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: retq
%a1.val = load double, double* %a1
%a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
%a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
ret <2 x double> %res
}
define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_current_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%a1.val = load double, double* %a1
%a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
%a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
ret <2 x double> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {