From eec768b5c402ef47ec584d5b72dccc8a3cc29ee7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 6 Sep 2017 03:35:58 +0000 Subject: [PATCH] [X86] Add more FMA3 patterns to cover a load in all 3 possible positions. This matches what we already do for AVX512. The peephole pass makes up for this in most if not all cases. But this makes isel behavior for these consistent with every other instruction. llvm-svn: 312613 --- llvm/lib/Target/X86/X86InstrAVX512.td | 6 +- llvm/lib/Target/X86/X86InstrFMA.td | 213 +++++++++++++++++--------- 2 files changed, 144 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e087b4e7fab5..0979c9658a0e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6914,6 +6914,8 @@ multiclass avx512_fma3s_all opc213, bits<8> opc231, bits<8> opc132, (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>; + // One pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
defm NAME#132#SUFF#Z: avx512_fma3s_common opc213, bits<8> opc231, bits<8> opc132, (null_frag), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, _.FRC:$src2))), - (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, - (_.ScalarLdFrag addr:$src3), _.FRC:$src2))), 1>; + (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3), + _.FRC:$src1, _.FRC:$src2))), 1>; } } diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td index 163fe5db76ef..453dcd83df1f 100644 --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -15,8 +15,8 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined -// below, both the register and memory variants are commutable. +// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses +// defined below, both the register and memory variants are commutable. // For the register form the commutable operands are 1, 2 and 3. // For the memory variant the folded operand must be in 3. Thus, // in that case, only the operands 1 and 2 can be swapped. @@ -34,56 +34,85 @@ // operands 1 and 3 (register forms only): *231* --> *213*; // operands 2 and 3 (register forms only): *231* --> *231*(no changes). 
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in -multiclass fma3p_rm opc, string OpcodeStr, - PatFrag MemFrag128, PatFrag MemFrag256, - ValueType OpVT128, ValueType OpVT256, - SDPatternOperator Op = null_frag> { - def r : FMA3 opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + def r : FMA3; + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>; let mayLoad = 1 in - def m : FMA3; - - def Yr : FMA3, VEX_L; - - let mayLoad = 1 in - def Ym : FMA3, VEX_L; + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, + (MemFrag addr:$src3))))]>; } +multiclass fma3p_rm_231 opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + let hasSideEffects = 0 in + def r : FMA3; + + let mayLoad = 1 in + def m : FMA3; +} + +multiclass fma3p_rm_132 opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + let hasSideEffects = 0 in + def r : FMA3; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ let mayLoad = 1 in + def m : FMA3; +} + +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_forms opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, string Suff, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - defm NAME#213#Suff : fma3p_rm; - defm NAME#132#Suff : fma3p_rm; - defm NAME#231#Suff : fma3p_rm; + defm NAME#213#Suff : fma3p_rm_213; + defm NAME#231#Suff : fma3p_rm_231; + defm NAME#132#Suff : fma3p_rm_132; + + defm NAME#213#Suff#Y : fma3p_rm_213, + VEX_L; + defm NAME#231#Suff#Y : fma3p_rm_231, + VEX_L; + defm NAME#132#Suff#Y : fma3p_rm_132, + VEX_L; } // Fused Multiply-Add @@ -93,11 +122,9 @@ let ExeDomain = SSEPackedSingle in { defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", - loadv4f32, loadv8f32, X86Fmaddsub, - v4f32, v8f32>; + loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>; defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS", - loadv4f32, loadv8f32, X86Fmsubadd, - v4f32, v8f32>; + loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { @@ -138,23 +165,77 @@ let ExeDomain = SSEPackedDouble in { // FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; // Please see more detailed comment at the very beginning of the section // defining FMA3 opcodes above. 
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in -multiclass fma3s_rm opc, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - SDPatternOperator OpNode = null_frag> { - def r : FMA3; +multiclass fma3s_rm_213 opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + def r : FMA3; let mayLoad = 1 in - def m : FMA3; + def m : FMA3; +} + +multiclass fma3s_rm_231 opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + let hasSideEffects = 0 in + def r : FMA3; + + let mayLoad = 1 in + def m : FMA3; +} + +multiclass fma3s_rm_132 opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + let hasSideEffects = 0 in + def r : FMA3; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. + let mayLoad = 1 in + def m : FMA3; +} + +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, string Suff, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop> { + let Predicates = [HasFMA, NoAVX512] in { + defm NAME#213#Suff : fma3s_rm_213; + defm NAME#231#Suff : fma3s_rm_231; + defm NAME#132#Suff : fma3s_rm_132; + } } // These FMA*_Int instructions are defined specially for being used when @@ -188,20 +269,6 @@ multiclass fma3s_rm_int opc, string OpcodeStr, []>; } -multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string Suff, - SDNode OpNode, RegisterClass RC, - X86MemOperand x86memop> { - let Predicates = [HasFMA, NoAVX512] in { - defm NAME#132#Suff : fma3s_rm; - defm NAME#213#Suff : fma3s_rm; - defm NAME#231#Suff : fma3s_rm; - } -} - // The FMA 213 form is created for lowering of scalar FMA intrinscis // to machine instructions. 
// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd operands