forked from OSchip/llvm-project
[X86] Make IFMA instructions commutable during isel so we can fold broadcast loads.
This required changing the ISD opcode for these instructions to have the commutable operands first and the addend last. This way tablegen can autogenerate the additional patterns for us. llvm-svn: 314083
This commit is contained in:
parent
4ffd90c504
commit
47e14ead54
|
@@ -19580,6 +19580,26 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
|
|||
Src3, Rnd),
|
||||
Mask, PassThru, Subtarget, DAG);
|
||||
}
|
||||
case IFMA_OP_MASKZ:
|
||||
case IFMA_OP_MASK: {
|
||||
SDValue Src1 = Op.getOperand(1);
|
||||
SDValue Src2 = Op.getOperand(2);
|
||||
SDValue Src3 = Op.getOperand(3);
|
||||
SDValue Mask = Op.getOperand(4);
|
||||
MVT VT = Op.getSimpleValueType();
|
||||
SDValue PassThru = Src1;
|
||||
|
||||
// set PassThru element
|
||||
if (IntrData->Type == IFMA_OP_MASKZ)
|
||||
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
|
||||
|
||||
// Node we need to swizzle the operands to pass the multiply operands
|
||||
// first.
|
||||
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
|
||||
dl, Op.getValueType(),
|
||||
Src2, Src3, Src1),
|
||||
Mask, PassThru, Subtarget, DAG);
|
||||
}
|
||||
case TERLOG_OP_MASK:
|
||||
case TERLOG_OP_MASKZ: {
|
||||
SDValue Src1 = Op.getOperand(1);
|
||||
|
|
|
@@ -467,6 +467,10 @@ namespace llvm {
|
|||
|
||||
// Multiply and Add Packed Integers.
|
||||
VPMADDUBSW, VPMADDWD,
|
||||
|
||||
// AVX512IFMA multiply and add.
|
||||
// NOTE: These are different than the instruction and perform
|
||||
// op0 x op1 + op2.
|
||||
VPMADD52L, VPMADD52H,
|
||||
|
||||
// FMA nodes.
|
||||
|
|
|
@@ -6480,30 +6480,30 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
|
|||
let Constraints = "$src1 = $dst" in {
|
||||
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
X86VectorVTInfo _> {
|
||||
// NOTE: The SDNode have the multiply operands first with the add last.
|
||||
// This enables commuted load patterns to be autogenerated by tablegen.
|
||||
let ExeDomain = _.ExeDomain in {
|
||||
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.RC:$src3),
|
||||
OpcodeStr, "$src3, $src2", "$src2, $src3",
|
||||
(_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1, 1>,
|
||||
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
|
||||
AVX512FMA3Base;
|
||||
|
||||
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.MemOp:$src3),
|
||||
OpcodeStr, "$src3, $src2", "$src2, $src3",
|
||||
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
|
||||
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
|
||||
AVX512FMA3Base;
|
||||
|
||||
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src2, _.ScalarMemOp:$src3),
|
||||
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
|
||||
!strconcat("$src2, ${src3}", _.BroadcastStr ),
|
||||
(OpNode _.RC:$src1,
|
||||
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
|
||||
(OpNode _.RC:$src2,
|
||||
(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
|
||||
_.RC:$src1)>,
|
||||
AVX512FMA3Base, EVEX_B;
|
||||
}
|
||||
|
||||
// TODO: Should be able to match a memory op in operand 2.
|
||||
// TODO: These instructions should be marked Commutable on operand 2 and 3.
|
||||
}
|
||||
} // Constraints = "$src1 = $dst"
|
||||
|
||||
|
|
|
@@ -498,8 +498,8 @@ def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound, [SDNPCommut
|
|||
|
||||
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
|
||||
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>;
|
||||
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>;
|
||||
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
|
||||
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
|
||||
|
||||
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
|
||||
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
|
||||
|
|
|
@@ -30,6 +30,7 @@ enum IntrinsicType : uint16_t {
|
|||
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
|
||||
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
|
||||
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
|
||||
IFMA_OP_MASK, IFMA_OP_MASKZ,
|
||||
VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
|
||||
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
|
||||
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST32x2_TO_VEC,
|
||||
|
@@ -1208,17 +1209,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86ISD::VPERMV3, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
|
||||
X86ISD::VPERMV3, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
|
||||
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
|
||||
|
@@ -1354,17 +1355,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86ISD::VPERMV3, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
|
||||
X86ISD::VPERMV3, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52H, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
|
||||
X86ISD::VPMADD52L, 0),
|
||||
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
|
||||
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
|
||||
|
|
|
@@ -151,8 +151,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0
|
|||
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
|
||||
; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
|
||||
; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%x1load = load i64, i64* %x1ptr
|
||||
|
@@ -204,8 +203,7 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8
|
|||
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
|
||||
; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1}
|
||||
; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%x1load = load i64, i64* %x1ptr
|
||||
|
@@ -257,8 +255,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<
|
|||
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
|
||||
; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%x1load = load i64, i64* %x1ptr
|
||||
|
|
Loading…
Reference in New Issue