[X86] Allow masked VBROADCAST instructions to be turned into BLENDM with a broadcast load to avoid a copy.

The BLENDM instructions allow an 2 sources and an independent
destination while masked VBROADCAST has the destination tied
to the source.

llvm-svn: 372068
This commit is contained in:
Craig Topper 2019-09-17 04:41:10 +00:00
parent 2cc57bedd5
commit 769dd59a27
3 changed files with 159 additions and 80 deletions

View File

@ -1123,50 +1123,103 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo,
bit IsConvertibleToThreeAddress,
SDPatternOperator UnmaskedOp = X86VBroadcast> {
let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in {
defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo,
(outs MaskInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))),
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
T8PD, EVEX, Sched<[SchedRR]>;
let mayLoad = 1 in
defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
(outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(MaskInfo.VT
(bitconvert
(DestInfo.VT (UnmaskedOp
(SrcInfo.ScalarLdFrag addr:$src))))),
(MaskInfo.VT
(bitconvert
(DestInfo.VT (X86VBroadcast
(SrcInfo.ScalarLdFrag addr:$src)))))>,
T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
Sched<[SchedRM]>;
}
let hasSideEffects = 0 in
def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set MaskInfo.RC:$dst,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
(ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set MaskInfo.RC:$dst,
(vselect MaskInfo.KRCWM:$mask,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
MaskInfo.ImmAllZerosV))],
DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
let Constraints = "$src0 = $dst" in
def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
(ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
SrcInfo.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
"${dst} {${mask}}, $src}"),
[(set MaskInfo.RC:$dst,
(vselect MaskInfo.KRCWM:$mask,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
MaskInfo.RC:$src0))],
DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
let hasSideEffects = 0, mayLoad = 1 in
def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set MaskInfo.RC:$dst,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(UnmaskedOp (SrcInfo.ScalarLdFrag addr:$src))))))],
DestInfo.ExeDomain>, T8PD, EVEX,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
(ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set MaskInfo.RC:$dst,
(vselect MaskInfo.KRCWM:$mask,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))),
MaskInfo.ImmAllZerosV))],
DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
let Constraints = "$src0 = $dst",
isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
(ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
SrcInfo.ScalarMemOp:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
"${dst} {${mask}}, $src}"),
[(set MaskInfo.RC:$dst,
(vselect MaskInfo.KRCWM:$mask,
(MaskInfo.VT
(bitconvert
(DestInfo.VT
(X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))),
MaskInfo.RC:$src0))],
DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
// Helper class to force mask and broadcast result to same type.
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo> :
X86VectorVTInfo SrcInfo,
bit IsConvertibleToThreeAddress> :
avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
DestInfo, DestInfo, SrcInfo>;
DestInfo, DestInfo, SrcInfo,
IsConvertibleToThreeAddress>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@ -1174,7 +1227,7 @@ multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
@ -1185,7 +1238,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info512, _.info128>,
WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@ -1193,12 +1246,12 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info256, _.info128>,
WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
WriteFShuffle256Ld, _.info128, _.info128>,
WriteFShuffle256Ld, _.info128, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
_.info128>,
EVEX_V128;
@ -1283,30 +1336,34 @@ defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd> {
AVX512VLVectorVTInfo _, Predicate prd,
bit IsConvertibleToThreeAddress> {
let Predicates = [prd] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info512, _.info128>,
WriteShuffle256Ld, _.info512, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _.info256, _.info128>,
WriteShuffle256Ld, _.info256, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
WriteShuffleXLd, _.info128, _.info128>,
WriteShuffleXLd, _.info128, _.info128,
IsConvertibleToThreeAddress>,
EVEX_V128;
}
}
defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
avx512vl_i8_info, HasBWI>;
avx512vl_i8_info, HasBWI, 0>;
defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
avx512vl_i16_info, HasBWI>;
avx512vl_i16_info, HasBWI, 0>;
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
avx512vl_i32_info, HasAVX512>;
avx512vl_i32_info, HasAVX512, 1>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
avx512vl_i64_info, HasAVX512>, VEX_W1X;
avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
@ -1612,12 +1669,12 @@ multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI] in
defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
_Src.info512, _Src.info128, null_frag>,
_Src.info512, _Src.info128, 0, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
_Src.info256, _Src.info128, null_frag>,
_Src.info256, _Src.info128, 0, null_frag>,
EVEX_V256;
}
@ -1628,7 +1685,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI, HasVLX] in
defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
WriteShuffleXLd, _Dst.info128,
_Src.info128, _Src.info128, null_frag>,
_Src.info128, _Src.info128, 0, null_frag>,
EVEX_V128;
}
@ -1913,7 +1970,7 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
}
multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let mayLoad = 1, hasSideEffects = 0 in {
let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,

View File

@ -1177,40 +1177,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
case X86::VBROADCASTSDZ256mk:
case X86::VBROADCASTSDZmk:
case X86::VBROADCASTSSZ128mk:
case X86::VBROADCASTSSZ256mk:
case X86::VBROADCASTSSZmk:
case X86::VPBROADCASTDZ128mk:
case X86::VPBROADCASTDZ256mk:
case X86::VPBROADCASTDZmk:
case X86::VPBROADCASTQZ128mk:
case X86::VPBROADCASTQZ256mk:
case X86::VPBROADCASTQZmk: {
unsigned Opc;
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break;
case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break;
case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break;
case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break;
case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break;
case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break;
case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break;
case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break;
case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break;
case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break;
case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break;
}
NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@ -1224,6 +1246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.add(MI.getOperand(7));
break;
}
case X86::VMOVDQU8Z128rrk:
case X86::VMOVDQU8Z256rrk:
case X86::VMOVDQU8Zrrk:

View File

@ -4458,8 +4458,7 @@ define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
; CHECK-NEXT: vmovaps %ymm1, %ymm2
; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 {%k1}
; CHECK-NEXT: vblendmps {{.*}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB126_1