[X86][AVX512] Tag VPCMP/VPCMPU instruction scheduler classes

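Move the hardcoded itinerary out of the avx512_icmp_cc multiclasses and into the instruction declarations: the multiclasses now take an OpndItins parameter, and each VPCMP/VPCMPU defm passes SSE_ALU_F32P and gains the matching Sched<> tags. I'm not sure that IIC_SSE_ALU_F32P is the best schedule for integer comparisons, but I'm not going to change it right now.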

llvm-svn: 319760
This commit is contained in:
Simon Pilgrim 2017-12-05 12:14:36 +00:00
parent c360f41097
commit aa91155960
2 changed files with 100 additions and 82 deletions

View File

@ -2092,7 +2092,7 @@ def CommutePCMPCC : SDNodeXForm<imm, [{
}]>;
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> {
OpndItins itins, X86VectorVTInfo _> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
@ -2100,7 +2100,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
@ -2108,7 +2108,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@ -2119,7 +2119,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
@ -2130,7 +2130,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
itins.rm>, EVEX_4V, EVEX_K,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
@ -2138,20 +2139,20 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
[], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
[], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
[], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
@ -2159,7 +2160,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
[], itins.rm>, EVEX_4V, EVEX_K,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
@ -2175,8 +2177,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> :
avx512_icmp_cc<opc, Suffix, OpNode, _> {
OpndItins itins, X86VectorVTInfo _> :
avx512_icmp_cc<opc, Suffix, OpNode, itins, _> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
AVX512ICC:$cc),
@ -2186,7 +2188,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
itins.rm>, EVEX_4V, EVEX_B,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, AVX512ICC:$cc),
@ -2197,7 +2200,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc)))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@ -2207,14 +2211,16 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
[], itins.rm>, EVEX_4V, EVEX_B,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
[], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
@ -2231,49 +2237,61 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
OpndItins itins, AVX512VLVectorVTInfo VTInfo,
Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
OpndItins itins, AVX512VLVectorVTInfo VTInfo,
Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>,
EVEX_V128;
}
}
defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
HasBWI>, EVEX_CD8<8, CD8VF>;
defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
HasBWI>, EVEX_CD8<8, CD8VF>;
// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
avx512vl_i64_info, HasAVX512>,
VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
avx512vl_i64_info, HasAVX512>,
VEX_W, EVEX_CD8<64, CD8VF>;
multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {

View File

@ -401,7 +401,7 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma
; GENERIC-LABEL: vpaddd_mask_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -421,7 +421,7 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m
; GENERIC-LABEL: vpaddd_maskz_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -441,7 +441,7 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
; GENERIC-LABEL: vpaddd_mask_fold_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -462,7 +462,7 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
; GENERIC-LABEL: vpaddd_mask_broadcast_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -482,7 +482,7 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
; GENERIC-LABEL: vpaddd_maskz_fold_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -503,7 +503,7 @@ define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
; GENERIC-LABEL: vpaddd_maskz_broadcast_test:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -718,7 +718,7 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
; GENERIC-LABEL: test_mask_vaddps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -740,7 +740,7 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x
; GENERIC-LABEL: test_mask_vmulps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -760,7 +760,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x
; GENERIC-LABEL: test_mask_vminps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -781,7 +781,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x d
; GENERIC-LABEL: test_mask_vminpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -802,7 +802,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x
; GENERIC-LABEL: test_mask_vmaxps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -823,7 +823,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x d
; GENERIC-LABEL: test_mask_vmaxpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -844,7 +844,7 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x
; GENERIC-LABEL: test_mask_vsubps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -864,7 +864,7 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x
; GENERIC-LABEL: test_mask_vdivps:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -884,7 +884,7 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x d
; GENERIC-LABEL: test_mask_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1
; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -904,7 +904,7 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i6
; GENERIC-LABEL: test_maskz_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -924,7 +924,7 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <
; GENERIC-LABEL: test_mask_fold_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -945,7 +945,7 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <
; GENERIC-LABEL: test_maskz_fold_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -984,7 +984,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
; GENERIC-LABEL: test_mask_broadcast_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
; GENERIC-NEXT: vmovapd %zmm1, %zmm0
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -1010,7 +1010,7 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
; GENERIC-LABEL: test_maskz_broadcast_vaddpd:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -2965,7 +2965,7 @@ define <2 x float> @ubto2f32(<2 x i32> %a) {
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -2988,7 +2988,7 @@ define <2 x double> @ubto2f64(<2 x i32> %a) {
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -4423,7 +4423,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; GENERIC-LABEL: sext_8i1_8i32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0
; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00]
; GENERIC-NEXT: vpmovm2d %k0, %ymm0
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6402,7 +6402,7 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6423,7 +6423,7 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test33:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6444,7 +6444,7 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test34:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6465,7 +6465,7 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
; GENERIC-LABEL: mov_test35:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6486,7 +6486,7 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test36:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6507,7 +6507,7 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test37:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6528,7 +6528,7 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test38:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1
; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6549,7 +6549,7 @@ define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
; GENERIC-LABEL: mov_test39:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1
; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -6947,7 +6947,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; GENERIC-LABEL: zext_test1:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
; GENERIC-NEXT: kshiftlw $10, %k0, %k0
; GENERIC-NEXT: kshiftrw $15, %k0, %k0
; GENERIC-NEXT: kmovd %k0, %eax
@ -6973,7 +6973,7 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
; GENERIC-LABEL: zext_test2:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
; GENERIC-NEXT: kshiftlw $10, %k0, %k0
; GENERIC-NEXT: kshiftrw $15, %k0, %k0
; GENERIC-NEXT: kmovd %k0, %eax
@ -7001,7 +7001,7 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; GENERIC-LABEL: zext_test3:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
; GENERIC-NEXT: kshiftlw $10, %k0, %k0
; GENERIC-NEXT: kshiftrw $15, %k0, %k0
; GENERIC-NEXT: kmovd %k0, %eax
@ -7150,7 +7150,7 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
; GENERIC-NEXT: jg .LBB386_1 # sched: [1:1.00]
; GENERIC-NEXT: # %bb.2:
; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00]
; GENERIC-NEXT: vpmovm2b %k0, %xmm0
; GENERIC-NEXT: vzeroupper
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -8339,7 +8339,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
; GENERIC-LABEL: _ss16xfloat_mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
; GENERIC-NEXT: vmovaps %zmm1, %zmm0
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -8362,7 +8362,7 @@ define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
; GENERIC-LABEL: _ss16xfloat_maskz:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -8399,7 +8399,7 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
; GENERIC-LABEL: _ss16xfloat_mask_load:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -8421,7 +8421,7 @@ define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1)
; GENERIC-LABEL: _ss16xfloat_maskz_load:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -8458,7 +8458,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
; GENERIC-LABEL: _sd8xdouble_mask:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
; GENERIC-NEXT: vmovapd %zmm1, %zmm0
; GENERIC-NEXT: retq # sched: [1:1.00]
@ -8481,7 +8481,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; GENERIC-LABEL: _sd8xdouble_maskz:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -8518,7 +8518,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
; GENERIC-LABEL: _sd8xdouble_mask_load:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
; GENERIC-NEXT: retq # sched: [1:1.00]
;
@ -8540,7 +8540,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1)
; GENERIC-LABEL: _sd8xdouble_maskz_load:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1
; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
; GENERIC-NEXT: retq # sched: [1:1.00]
;