forked from OSchip/llvm-project
[X86] Add more FMA3 patterns to cover a load in all 3 possible positions.
This matches what we already do for AVX512. The peephole pass makes up for this in most if not all cases. But this makes isel behavior for these consistent with every other instruction. llvm-svn: 312613
This commit is contained in:
parent
112a6bac72
commit
eec768b5c4
|
@ -6914,6 +6914,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
|
|||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
|
||||
(_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>;
|
||||
|
||||
// One pattern is 312 order so that the load is in a different place from the
|
||||
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
|
||||
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
|
||||
(null_frag),
|
||||
(_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
|
||||
|
@ -6921,8 +6923,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
|
|||
(null_frag),
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
|
||||
_.FRC:$src2))),
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
|
||||
(_.ScalarLdFrag addr:$src3), _.FRC:$src2))), 1>;
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
|
||||
_.FRC:$src1, _.FRC:$src2))), 1>;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -15,8 +15,8 @@
|
|||
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// For all FMA opcodes declared in fma3p_rm and fma3s_rm multiclasses defined
|
||||
// below, both the register and memory variants are commutable.
|
||||
// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses
|
||||
// defined below, both the register and memory variants are commutable.
|
||||
// For the register form the commutable operands are 1, 2 and 3.
|
||||
// For the memory variant the folded operand must be in 3. Thus,
|
||||
// in that case, only the operands 1 and 2 can be swapped.
|
||||
|
@ -34,56 +34,85 @@
|
|||
// operands 1 and 3 (register forms only): *231* --> *213*;
|
||||
// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
|
||||
|
||||
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
|
||||
multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
|
||||
PatFrag MemFrag128, PatFrag MemFrag256,
|
||||
ValueType OpVT128, ValueType OpVT256,
|
||||
SDPatternOperator Op = null_frag> {
|
||||
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, VR128:$src3),
|
||||
multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
|
||||
SDNode Op> {
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
|
||||
VR128:$src1, VR128:$src3)))]>;
|
||||
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>;
|
||||
|
||||
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
|
||||
(MemFrag128 addr:$src3))))]>;
|
||||
|
||||
def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
|
||||
(ins VR256:$src1, VR256:$src2, VR256:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
|
||||
VR256:$src3)))]>, VEX_L;
|
||||
|
||||
let mayLoad = 1 in
|
||||
def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
|
||||
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set VR256:$dst,
|
||||
(OpVT256 (Op VR256:$src2, VR256:$src1,
|
||||
(MemFrag256 addr:$src3))))]>, VEX_L;
|
||||
[(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
|
||||
(MemFrag addr:$src3))))]>;
|
||||
}
|
||||
|
||||
// fma3p_rm_231: defines the register (r) and memory (m) instruction variants
// for one FMA3 opcode `opc` over register class RC with value type VT.
// Only the memory variant carries a selection pattern; in it the load sits in
// the second operand position of Op.
multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
|
||||
SDNode Op> {
|
||||
// Register form: all three sources are RC registers; the pattern list is
// empty, so this def contributes no selection pattern of its own.
let hasSideEffects = 0 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[]>;
|
||||
|
||||
// Memory form: $src3 is an x86memop operand read through MemFrag.
// Pattern matched: (Op $src2, (MemFrag $src3), $src1).
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
|
||||
RC:$src1)))]>;
|
||||
}
|
||||
|
||||
// fma3p_rm_132: defines the register (r) and memory (m) instruction variants
// for one FMA3 opcode `opc` over register class RC with value type VT.
// Only the memory variant carries a selection pattern; in it the load sits in
// the first operand position of Op.
multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
|
||||
ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
|
||||
SDNode Op> {
|
||||
// Register form: all three sources are RC registers; the pattern list is
// empty, so this def contributes no selection pattern of its own.
let hasSideEffects = 0 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[]>;
|
||||
|
||||
// Pattern is 312 order so that the load is in a different place from the
|
||||
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
|
||||
// Pattern matched: (Op (MemFrag $src3), $src1, $src2).
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
|
||||
RC:$src2)))]>;
|
||||
}
|
||||
|
||||
let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
|
||||
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
string OpcodeStr, string PackTy, string Suff,
|
||||
PatFrag MemFrag128, PatFrag MemFrag256,
|
||||
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
|
||||
defm NAME#213#Suff : fma3p_rm<opc213,
|
||||
!strconcat(OpcodeStr, "213", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
|
||||
defm NAME#132#Suff : fma3p_rm<opc132,
|
||||
!strconcat(OpcodeStr, "132", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256>;
|
||||
defm NAME#231#Suff : fma3p_rm<opc231,
|
||||
!strconcat(OpcodeStr, "231", PackTy),
|
||||
MemFrag128, MemFrag256, OpTy128, OpTy256>;
|
||||
defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
|
||||
VR128, OpTy128, f128mem, MemFrag128, Op>;
|
||||
defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
|
||||
VR128, OpTy128, f128mem, MemFrag128, Op>;
|
||||
defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
|
||||
VR128, OpTy128, f128mem, MemFrag128, Op>;
|
||||
|
||||
defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
|
||||
VR256, OpTy256, f256mem, MemFrag256, Op>,
|
||||
VEX_L;
|
||||
defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
|
||||
VR256, OpTy256, f256mem, MemFrag256, Op>,
|
||||
VEX_L;
|
||||
defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
|
||||
VR256, OpTy256, f256mem, MemFrag256, Op>,
|
||||
VEX_L;
|
||||
}
|
||||
|
||||
// Fused Multiply-Add
|
||||
|
@ -93,11 +122,9 @@ let ExeDomain = SSEPackedSingle in {
|
|||
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
|
||||
loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
|
||||
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
|
||||
loadv4f32, loadv8f32, X86Fmaddsub,
|
||||
v4f32, v8f32>;
|
||||
loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>;
|
||||
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
|
||||
loadv4f32, loadv8f32, X86Fmsubadd,
|
||||
v4f32, v8f32>;
|
||||
loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>;
|
||||
}
|
||||
|
||||
let ExeDomain = SSEPackedDouble in {
|
||||
|
@ -138,10 +165,9 @@ let ExeDomain = SSEPackedDouble in {
|
|||
// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
|
||||
// Please see more detailed comment at the very beginning of the section
|
||||
// defining FMA3 opcodes above.
|
||||
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
|
||||
multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
|
||||
multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, RegisterClass RC,
|
||||
SDPatternOperator OpNode = null_frag> {
|
||||
SDPatternOperator OpNode> {
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
|
@ -157,6 +183,61 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
|
|||
(OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
|
||||
}
|
||||
|
||||
// fma3s_rm_231: defines the register (r) and memory (m) instruction variants
// for one FMA3 opcode `opc` over register class RC.
// Only the memory variant carries a selection pattern; in it the load sits in
// the second operand position of OpNode.
multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, RegisterClass RC,
|
||||
SDPatternOperator OpNode> {
|
||||
// Register form: all three sources are RC registers; the pattern list is
// empty, so this def contributes no selection pattern of its own.
let hasSideEffects = 0 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[]>;
|
||||
|
||||
// Memory form: $src3 is an x86memop operand read with `load`.
// Pattern matched: (OpNode $src2, (load $src3), $src1).
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set RC:$dst,
|
||||
(OpNode RC:$src2, (load addr:$src3), RC:$src1))]>;
|
||||
}
|
||||
|
||||
// fma3s_rm_132: defines the register (r) and memory (m) instruction variants
// for one FMA3 opcode `opc` over register class RC.
// Only the memory variant carries a selection pattern; in it the load sits in
// the first operand position of OpNode.
multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
|
||||
X86MemOperand x86memop, RegisterClass RC,
|
||||
SDPatternOperator OpNode> {
|
||||
// Register form: all three sources are RC registers; the pattern list is
// empty, so this def contributes no selection pattern of its own.
let hasSideEffects = 0 in
|
||||
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, RC:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[]>;
|
||||
|
||||
// Pattern is 312 order so that the load is in a different place from the
|
||||
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
|
||||
// Pattern matched: (OpNode (load $src3), $src1, $src2).
let mayLoad = 1 in
|
||||
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
|
||||
(ins RC:$src1, RC:$src2, x86memop:$src3),
|
||||
!strconcat(OpcodeStr,
|
||||
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
|
||||
[(set RC:$dst,
|
||||
(OpNode (load addr:$src3), RC:$src1, RC:$src2))]>;
|
||||
}
|
||||
|
||||
// fma3s_forms: instantiates the 213, 231 and 132 variants of one FMA3
// operation, each from its own multiclass (fma3s_rm_213 / _231 / _132) and
// each receiving the same OpNode, register class and memory operand.
// The enclosing `let` applies the "$src1 = $dst" constraint, isCommutable = 1
// and hasSideEffects = 0 to every record produced here.
let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
|
||||
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
string OpStr, string PackTy, string Suff,
|
||||
SDNode OpNode, RegisterClass RC,
|
||||
X86MemOperand x86memop> {
|
||||
// All three forms are gated on the HasFMA and NoAVX512 predicates.
let Predicates = [HasFMA, NoAVX512] in {
|
||||
// Mnemonic for each form is OpStr + form number + PackTy.
defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
|
||||
x86memop, RC, OpNode>;
|
||||
defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
|
||||
x86memop, RC, OpNode>;
|
||||
defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
|
||||
x86memop, RC, OpNode>;
|
||||
}
|
||||
}
|
||||
|
||||
// These FMA*_Int instructions are defined specially for being used when
|
||||
// the scalar FMA intrinsics are lowered to machine instructions, and in that
|
||||
// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
|
||||
|
@ -188,20 +269,6 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
|
|||
[]>;
|
||||
}
|
||||
|
||||
// fma3s_forms (variant built on the single fma3s_rm multiclass): instantiates
// the 132, 213 and 231 forms of one FMA3 operation. Only the 213 form is
// passed OpNode; the 132 and 231 instantiations omit it and so fall back to
// whatever default fma3s_rm declares for that parameter.
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
|
||||
string OpStr, string PackTy, string Suff,
|
||||
SDNode OpNode, RegisterClass RC,
|
||||
X86MemOperand x86memop> {
|
||||
// All three forms are gated on the HasFMA and NoAVX512 predicates.
let Predicates = [HasFMA, NoAVX512] in {
|
||||
// Mnemonic for each form is OpStr + form number + PackTy.
defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
|
||||
x86memop, RC>;
|
||||
defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
|
||||
x86memop, RC, OpNode>;
|
||||
defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
|
||||
x86memop, RC>;
|
||||
}
|
||||
}
|
||||
|
||||
// The FMA 213 form is created for lowering of scalar FMA intrinsics
|
||||
// to machine instructions.
|
||||
// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd operands
|
||||
|
|
Loading…
Reference in New Issue