forked from OSchip/llvm-project
AVX-512 set: Added BROADCAST instructions
with lowering logic and a test. llvm-svn: 187884
This commit is contained in:
parent
0897fce2f4
commit
45c54ad8dc
|
@ -5406,7 +5406,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
|
|||
MVT VT = Op.getValueType().getSimpleVT();
|
||||
SDLoc dl(Op);
|
||||
|
||||
assert((VT.is128BitVector() || VT.is256BitVector()) &&
|
||||
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
|
||||
"Unsupported vector type for broadcast.");
|
||||
|
||||
SDValue Ld;
|
||||
|
@ -5462,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
|
|||
// The scalar_to_vector node and the suspected
|
||||
// load node must have exactly one user.
|
||||
// Constants may have multiple users.
|
||||
if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
|
||||
|
||||
// AVX-512 has a register version of the broadcast.
|
||||
bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
|
||||
Ld.getValueType().getSizeInBits() >= 32;
|
||||
if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
|
||||
!hasRegVer))
|
||||
return SDValue();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool Is256 = VT.is256BitVector();
|
||||
bool IsGE256 = (VT.getSizeInBits() >= 256);
|
||||
|
||||
// Handle broadcasting a single constant scalar from the constant pool
|
||||
// into a vector. On Sandybridge it is still better to load a constant vector
|
||||
|
@ -5478,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
|
|||
assert(!CVT.isVector() && "Must not broadcast a vector type");
|
||||
unsigned ScalarSize = CVT.getSizeInBits();
|
||||
|
||||
if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
|
||||
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
|
||||
const Constant *C = 0;
|
||||
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
|
||||
C = CI->getConstantIntValue();
|
||||
|
@ -5502,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
|
|||
|
||||
// Handle AVX2 in-register broadcasts.
|
||||
if (!IsLoad && Subtarget->hasInt256() &&
|
||||
(ScalarSize == 32 || (Is256 && ScalarSize == 64)))
|
||||
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
|
||||
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
|
||||
|
||||
// The scalar source must be a normal load.
|
||||
if (!IsLoad)
|
||||
return SDValue();
|
||||
|
||||
if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
|
||||
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
|
||||
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
|
||||
|
||||
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
|
||||
|
@ -13230,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
|
||||
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
|
||||
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
|
||||
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
|
||||
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
|
||||
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
|
||||
case X86ISD::VPERMV: return "X86ISD::VPERMV";
|
||||
|
|
|
@ -321,6 +321,8 @@ namespace llvm {
|
|||
VPERMI,
|
||||
VPERM2X128,
|
||||
VBROADCAST,
|
||||
// masked broadcast
|
||||
VBROADCASTM,
|
||||
|
||||
// PMULUDQ - Vector multiply packed unsigned doubleword integers
|
||||
PMULUDQ,
|
||||
|
@ -852,7 +854,9 @@ namespace llvm {
|
|||
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
|
|
@ -347,6 +347,132 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
|
|||
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
|
||||
addr:$dst)]>, EVEX;
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
// AVX-512 BROADCAST
|
||||
//---
|
||||
multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr,
|
||||
RegisterClass DestRC,
|
||||
RegisterClass SrcRC, X86MemOperand x86memop> {
|
||||
def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[]>, EVEX;
|
||||
def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX;
|
||||
}
|
||||
let ExeDomain = SSEPackedSingle in {
|
||||
defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512,
|
||||
VR128X, f32mem>,
|
||||
EVEX_V512, EVEX_CD8<32, CD8VT1>;
|
||||
}
|
||||
|
||||
let ExeDomain = SSEPackedDouble in {
|
||||
defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
|
||||
VR128X, f64mem>,
|
||||
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
|
||||
}
|
||||
|
||||
def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
|
||||
(VBROADCASTSSZrm addr:$src)>;
|
||||
def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
|
||||
(VBROADCASTSDZrm addr:$src)>;
|
||||
|
||||
multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
|
||||
RegisterClass SrcRC, RegisterClass KRC> {
|
||||
def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[]>, EVEX, EVEX_V512;
|
||||
def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst),
|
||||
(ins KRC:$mask, SrcRC:$src),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
|
||||
[]>, EVEX, EVEX_V512, EVEX_KZ;
|
||||
}
|
||||
|
||||
defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
|
||||
defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
|
||||
VEX_W;
|
||||
|
||||
def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
|
||||
(VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
|
||||
|
||||
def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
|
||||
(VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
|
||||
|
||||
def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
|
||||
(VPBROADCASTDrZrr GR32:$src)>;
|
||||
def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
|
||||
(VPBROADCASTQrZrr GR64:$src)>;
|
||||
|
||||
multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, PatFrag ld_frag,
|
||||
RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
|
||||
RegisterClass KRC> {
|
||||
def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set DstRC:$dst,
|
||||
(OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
|
||||
def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
|
||||
VR128X:$src),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
|
||||
[(set DstRC:$dst,
|
||||
(OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
|
||||
EVEX, EVEX_KZ;
|
||||
def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[(set DstRC:$dst,
|
||||
(OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
|
||||
def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
|
||||
x86memop:$src),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
|
||||
[(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
|
||||
(ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
|
||||
}
|
||||
|
||||
defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
|
||||
loadi32, VR512, v16i32, v4i32, VK16WM>,
|
||||
EVEX_V512, EVEX_CD8<32, CD8VT1>;
|
||||
defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
|
||||
loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
|
||||
EVEX_CD8<64, CD8VT1>;
|
||||
|
||||
def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
|
||||
(VBROADCASTSSZrr VR128X:$src)>;
|
||||
def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
|
||||
(VBROADCASTSDZrr VR128X:$src)>;
|
||||
|
||||
// Provide a fallback in case the load node that is used in the patterns above
|
||||
// is used by additional users, which prevents the pattern selection.
|
||||
def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
|
||||
(VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
|
||||
def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
|
||||
(VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
|
||||
|
||||
|
||||
let Predicates = [HasAVX512] in {
|
||||
def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
|
||||
(EXTRACT_SUBREG
|
||||
(v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
|
||||
addr:$src)), sub_ymm)>;
|
||||
}
|
||||
//===----------------------------------------------------------------------===//
|
||||
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
|
||||
//---
|
||||
|
||||
multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
|
||||
RegisterClass DstRC, RegisterClass KRC,
|
||||
ValueType OpVT, ValueType SrcVT> {
|
||||
def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
|
||||
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
|
||||
[]>, EVEX;
|
||||
}
|
||||
|
||||
defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
|
||||
VK16, v16i32, v16i1>, EVEX_V512;
|
||||
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
|
||||
VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
|
||||
|
||||
// Mask register copy, including
|
||||
// - copy between mask registers
|
||||
// - load/store mask registers
|
||||
|
|
|
@ -157,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
|
|||
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
|
||||
|
||||
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
|
||||
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
|
||||
def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
|
||||
|
||||
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
|
||||
|
||||
|
@ -196,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
|
|||
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
|
||||
|
||||
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
|
||||
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
|
||||
|
||||
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
|
||||
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
|
||||
|
||||
;CHECK: _inreg16xi32
|
||||
;CHECK: vpbroadcastd {{.*}}, %zmm
|
||||
;CHECK: ret
|
||||
define <16 x i32> @_inreg16xi32(i32 %a) {
|
||||
%b = insertelement <16 x i32> undef, i32 %a, i32 0
|
||||
%c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x i32> %c
|
||||
}
|
||||
|
||||
;CHECK: _inreg8xi64
|
||||
;CHECK: vpbroadcastq {{.*}}, %zmm
|
||||
;CHECK: ret
|
||||
define <8 x i64> @_inreg8xi64(i64 %a) {
|
||||
%b = insertelement <8 x i64> undef, i64 %a, i32 0
|
||||
%c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x i64> %c
|
||||
}
|
||||
|
||||
;CHECK: _inreg16xfloat
|
||||
;CHECK: vbroadcastssz {{.*}}, %zmm
|
||||
;CHECK: ret
|
||||
define <16 x float> @_inreg16xfloat(float %a) {
|
||||
%b = insertelement <16 x float> undef, float %a, i32 0
|
||||
%c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
|
||||
ret <16 x float> %c
|
||||
}
|
||||
|
||||
;CHECK: _inreg8xdouble
|
||||
;CHECK: vbroadcastsdz {{.*}}, %zmm
|
||||
;CHECK: ret
|
||||
define <8 x double> @_inreg8xdouble(double %a) {
|
||||
%b = insertelement <8 x double> undef, double %a, i32 0
|
||||
%c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
ret <8 x double> %c
|
||||
}
|
Loading…
Reference in New Issue