forked from OSchip/llvm-project
[AMDGPU] gfx10 v_fmac_f16 operand folding
Fold immediates into v_fmac_f16. Differential Revision: https://reviews.llvm.org/D68037 llvm-svn: 372906
This commit is contained in:
parent
ac3243c3e1
commit
d3b2b97195
|
@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
|
|||
switch (Opc) {
|
||||
case AMDGPU::V_MAC_F32_e64:
|
||||
case AMDGPU::V_MAC_F16_e64:
|
||||
case AMDGPU::V_FMAC_F32_e64: {
|
||||
case AMDGPU::V_FMAC_F32_e64:
|
||||
case AMDGPU::V_FMAC_F16_e64: {
|
||||
// Special case for mac/fmac. Since this is replaced with mad/fma when folded into
|
||||
// src2, we need to check the legality for the final instruction.
|
||||
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
|
||||
if (static_cast<int>(OpNo) == Src2Idx) {
|
||||
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
|
||||
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
|
||||
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
|
||||
Opc == AMDGPU::V_FMAC_F16_e64;
|
||||
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
|
||||
Opc == AMDGPU::V_FMAC_F32_e64;
|
||||
|
||||
unsigned Opc = IsFMA ?
|
||||
AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
||||
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
|
||||
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
||||
const MCInstrDesc &MadDesc = TII->get(Opc);
|
||||
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
|
||||
}
|
||||
|
@ -314,12 +318,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
|
|||
// Special case for v_mac_{f16, f32}_e64 and v_fmac_{f16, f32}_e64 if we are trying to fold into src2
|
||||
unsigned Opc = MI->getOpcode();
|
||||
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
|
||||
Opc == AMDGPU::V_FMAC_F32_e64) &&
|
||||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
|
||||
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
|
||||
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
|
||||
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
|
||||
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
|
||||
Opc == AMDGPU::V_FMAC_F16_e64;
|
||||
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
|
||||
Opc == AMDGPU::V_FMAC_F32_e64;
|
||||
unsigned NewOpc = IsFMA ?
|
||||
AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
||||
(IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
|
||||
(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
|
||||
|
||||
// Check if changing this to a v_mad_{f16, f32} or v_fma_{f16, f32} instruction will allow us
|
||||
// to fold the operand.
|
||||
|
|
|
@ -126,7 +126,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, f
|
|||
; GFX8_10: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; GFX10-DENORM: v_fmac_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX10-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; GFX10-FLUSH: v_sub_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
|
||||
%x = bitcast i16 %x.arg to half
|
||||
|
@ -152,7 +152,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i
|
|||
; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
|
||||
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
|
||||
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
|
||||
; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
|
||||
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}
|
||||
|
||||
; GCN-DAG: buffer_store_short [[MUL2]]
|
||||
; GCN-DAG: buffer_store_short [[MAD]]
|
||||
|
@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i
|
|||
; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
|
||||
; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
|
||||
; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
|
||||
; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], |[[X]]|, 2.0
|
||||
; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
|
||||
|
||||
; GCN-DAG: buffer_store_short [[MUL2]]
|
||||
; GCN-DAG: buffer_store_short [[MAD]]
|
||||
|
@ -201,8 +201,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i
|
|||
; GFX10-FLUSH: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |{{s[0-9]+}}|
|
||||
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
|
||||
; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
|
||||
; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0
|
||||
; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X]]|, 2.0
|
||||
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, s{{[0-9]+}}
|
||||
; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, s{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
|
||||
%x = bitcast i16 %x.arg to half
|
||||
|
|
Loading…
Reference in New Issue