[AMDGPU] Enable v4f16 and above for v_pk_fma instructions

Summary:
If isel is presented with <2 x half> vectors, it correctly selects
v_pk_fma-style instructions.
If isel is presented with, e.g., <4 x half> vectors, it scalarizes the
operation instead, unlike other instruction types (such as fadd, fmul,
etc.), which are split into <2 x half> operations.

Added a splitTernaryVectorOp helper to enable the same splitting for FMA.
Updated one of the tests to cover this case (as well as extending the test
to GFX9).
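
For illustration, a minimal sketch (hypothetical IR, not taken from the
commit; the function name @example is invented here, mirroring the new
fma_v4f16 test below). After this change, a <4 x half> fma is split into two
<2 x half> halves and selects two v_pk_fma_f16 instructions on GFX9, instead
of being scalarized into four v_fma_f16 operations:

declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)

define <4 x half> @example(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
  ; splitTernaryVectorOp splits each of the three operands into lo/hi
  ; <2 x half> halves, emits one FMA per half, and recombines the two
  ; results with CONCAT_VECTORS.
  %r = call <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %r
}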

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65325

Change-Id: I50a4577a3f8223fb53992af3b7d26121f65b71ee
llvm-svn: 367206
Author: David Stuttard 2019-07-29 08:15:10 +00:00
parent 8538060103
commit 20235ef3e7
4 changed files with 214 additions and 84 deletions

@@ -653,6 +653,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMA, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
     setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
@@ -3971,6 +3972,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }

+SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+  SDValue Lo0, Hi0;
+  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  SDValue Lo1, Hi1;
+  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+  SDValue Lo2, Hi2;
+  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
+
+  SDLoc SL(Op);
+  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+                             Op->getFlags());
+  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+                             Op->getFlags());
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -4023,6 +4048,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMA:
+    return splitTernaryVectorOp(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:

@@ -331,6 +331,7 @@ public:
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,

@@ -3,74 +3,96 @@
 ; GCN-LABEL: {{^}}addMul2D:
 ; GFX1010: v_fmac_f16
 ; GFX1010: v_fmac_f16
-define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
-  %5 = extractelement <2 x i32> %2, i64 1
-  %6 = icmp sgt i32 %5, 0
-  br i1 %6, label %7, label %38
+define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly %arg, float addrspace(4)* nocapture readonly %arg1, <2 x i32> %arg2, i32 %arg3) local_unnamed_addr #0 {
+bb:
+  %tmp = extractelement <2 x i32> %arg2, i64 1
+  %tmp4 = icmp sgt i32 %tmp, 0
+  br i1 %tmp4, label %bb5, label %bb36

-7: ; preds = %4
-  %8 = extractelement <2 x i32> %2, i64 0
-  %9 = icmp sgt i32 %8, 0
-  br label %10
+bb5: ; preds = %bb
+  %tmp6 = extractelement <2 x i32> %arg2, i64 0
+  %tmp7 = icmp sgt i32 %tmp6, 0
+  br label %bb8

-10: ; preds = %34, %7
-  %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
-  %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
-  br i1 %9, label %13, label %34
+bb8: ; preds = %bb32, %bb5
+  %tmp9 = phi <4 x half> [ zeroinitializer, %bb5 ], [ %tmp33, %bb32 ]
+  %tmp10 = phi i32 [ 0, %bb5 ], [ %tmp34, %bb32 ]
+  br i1 %tmp7, label %bb11, label %bb32

-13: ; preds = %10
-  %14 = mul nsw i32 %12, %3
-  %15 = mul nsw i32 %12, %8
-  br label %16
+bb11: ; preds = %bb8
+  %tmp12 = mul nsw i32 %tmp10, %arg3
+  %tmp13 = mul nsw i32 %tmp10, %tmp6
+  br label %bb14

-16: ; preds = %16, %13
-  %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
-  %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
-  %19 = add nsw i32 %18, %14
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
-  %22 = load <4 x i8>, <4 x i8>* %21, align 4
-  %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
-  %24 = add nsw i32 %18, %15
-  %25 = sext i32 %24 to i64
-  %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
-  %27 = load float, float addrspace(4)* %26, align 4
-  %28 = fptrunc float %27 to half
-  %29 = insertelement <4 x half> undef, half %28, i32 0
-  %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
-  %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
-  %32 = add nuw nsw i32 %18, 1
-  %33 = icmp eq i32 %32, %8
-  br i1 %33, label %34, label %16
+bb14: ; preds = %bb14, %bb11
+  %tmp15 = phi <4 x half> [ %tmp9, %bb11 ], [ %tmp29, %bb14 ]
+  %tmp16 = phi i32 [ 0, %bb11 ], [ %tmp30, %bb14 ]
+  %tmp17 = add nsw i32 %tmp16, %tmp12
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = getelementptr inbounds <4 x i8>, <4 x i8>* %arg, i64 %tmp18
+  %tmp20 = load <4 x i8>, <4 x i8>* %tmp19, align 4
+  %tmp21 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %tmp20)
+  %tmp22 = add nsw i32 %tmp16, %tmp13
+  %tmp23 = sext i32 %tmp22 to i64
+  %tmp24 = getelementptr inbounds float, float addrspace(4)* %arg1, i64 %tmp23
+  %tmp25 = load float, float addrspace(4)* %tmp24, align 4
+  %tmp26 = fptrunc float %tmp25 to half
+  %tmp27 = insertelement <4 x half> undef, half %tmp26, i32 0
+  %tmp28 = shufflevector <4 x half> %tmp27, <4 x half> undef, <4 x i32> zeroinitializer
+  %vec.A.0 = extractelement <4 x half> %tmp21, i32 0
+  %vec.B.0 = extractelement <4 x half> %tmp28, i32 0
+  %vec.C.0 = extractelement <4 x half> %tmp15, i32 0
+  %vec.res.0 = tail call half @llvm.fmuladd.f16(half %vec.A.0, half %vec.B.0, half %vec.C.0)
+  %vec.A.1 = extractelement <4 x half> %tmp21, i32 1
+  %vec.B.1 = extractelement <4 x half> %tmp28, i32 1
+  %vec.C.1 = extractelement <4 x half> %tmp15, i32 1
+  %vec.res.1 = tail call half @llvm.fmuladd.f16(half %vec.A.1, half %vec.B.1, half %vec.C.1)
+  %vec.A.2 = extractelement <4 x half> %tmp21, i32 2
+  %vec.B.2 = extractelement <4 x half> %tmp28, i32 2
+  %vec.C.2 = extractelement <4 x half> %tmp15, i32 2
+  %vec.res.2 = tail call half @llvm.fmuladd.f16(half %vec.A.2, half %vec.B.2, half %vec.C.2)
+  %vec.A.3 = extractelement <4 x half> %tmp21, i32 3
+  %vec.B.3 = extractelement <4 x half> %tmp28, i32 3
+  %vec.C.3 = extractelement <4 x half> %tmp15, i32 3
+  %vec.res.3 = tail call half @llvm.fmuladd.f16(half %vec.A.3, half %vec.B.3, half %vec.C.3)
+  %full.res.0 = insertelement <4 x half> undef, half %vec.res.0, i32 0
+  %full.res.1 = insertelement <4 x half> %full.res.0, half %vec.res.1, i32 1
+  %full.res.2 = insertelement <4 x half> %full.res.1, half %vec.res.2, i32 2
+  %tmp29 = insertelement <4 x half> %full.res.2, half %vec.res.3, i32 3
+  %tmp30 = add nuw nsw i32 %tmp16, 1
+  %tmp31 = icmp eq i32 %tmp30, %tmp6
+  br i1 %tmp31, label %bb32, label %bb14

-34: ; preds = %16, %10
-  %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
-  %36 = add nuw nsw i32 %12, 1
-  %37 = icmp eq i32 %36, %5
-  br i1 %37, label %38, label %10
+bb32: ; preds = %bb14, %bb8
+  %tmp33 = phi <4 x half> [ %tmp9, %bb8 ], [ %tmp29, %bb14 ]
+  %tmp34 = add nuw nsw i32 %tmp10, 1
+  %tmp35 = icmp eq i32 %tmp34, %tmp
+  br i1 %tmp35, label %bb36, label %bb8

-38: ; preds = %34, %4
-  %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
-  ret <4 x half> %39
+bb36: ; preds = %bb32, %bb
+  %tmp37 = phi <4 x half> [ zeroinitializer, %bb ], [ %tmp33, %bb32 ]
+  ret <4 x half> %tmp37
 }

-define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
-  %2 = extractelement <4 x i8> %0, i64 0
-  %3 = uitofp i8 %2 to half
-  %4 = insertelement <4 x half> undef, half %3, i32 0
-  %5 = extractelement <4 x i8> %0, i64 1
-  %6 = uitofp i8 %5 to half
-  %7 = insertelement <4 x half> %4, half %6, i32 1
-  %8 = extractelement <4 x i8> %0, i64 2
-  %9 = uitofp i8 %8 to half
-  %10 = insertelement <4 x half> %7, half %9, i32 2
-  %11 = extractelement <4 x i8> %0, i64 3
-  %12 = uitofp i8 %11 to half
-  %13 = insertelement <4 x half> %10, half %12, i32 3
-  ret <4 x half> %13
+; Function Attrs: norecurse nounwind readnone
+define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %arg) local_unnamed_addr #1 {
+bb:
+  %tmp = extractelement <4 x i8> %arg, i64 0
+  %tmp1 = uitofp i8 %tmp to half
+  %tmp2 = insertelement <4 x half> undef, half %tmp1, i32 0
+  %tmp3 = extractelement <4 x i8> %arg, i64 1
+  %tmp4 = uitofp i8 %tmp3 to half
+  %tmp5 = insertelement <4 x half> %tmp2, half %tmp4, i32 1
+  %tmp6 = extractelement <4 x i8> %arg, i64 2
+  %tmp7 = uitofp i8 %tmp6 to half
+  %tmp8 = insertelement <4 x half> %tmp5, half %tmp7, i32 2
+  %tmp9 = extractelement <4 x i8> %arg, i64 3
+  %tmp10 = uitofp i8 %tmp9 to half
+  %tmp11 = insertelement <4 x half> %tmp8, half %tmp10, i32 3
+  ret <4 x half> %tmp11
 }

-declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare half @llvm.fmuladd.f16(half, half, half)

 attributes #0 = { convergent nounwind readonly}
 attributes #1 = { norecurse nounwind readnone }

@@ -1,8 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s

 declare half @llvm.fma.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)

 ; GCN-LABEL: {{^}}fma_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -13,7 +15,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16(
@@ -38,8 +40,8 @@ define amdgpu_kernel void @fma_f16(
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_a(
@@ -61,8 +63,8 @@ define amdgpu_kernel void @fma_f16_imm_a(
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_b(
@@ -84,8 +86,8 @@ define amdgpu_kernel void @fma_f16_imm_b(
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_c(
@@ -127,9 +129,11 @@ define amdgpu_kernel void @fma_f16_imm_c(
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16(
@@ -150,14 +154,14 @@ define amdgpu_kernel void @fma_v2f16(
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -172,9 +176,11 @@ define amdgpu_kernel void @fma_v2f16(
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_a(
@@ -192,11 +198,11 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 ; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -215,9 +221,11 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_b(
@@ -235,11 +243,11 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -265,6 +273,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
 ; GCN-NOT: and
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
@@ -278,3 +287,74 @@ define amdgpu_kernel void @fma_v2f16_imm_c(
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fma_v4f16
+; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
+; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
+; GCN: s_endpgm
+define amdgpu_kernel void @fma_v4f16(
+    <4 x half> addrspace(1)* %r,
+    <4 x half> addrspace(1)* %a,
+    <4 x half> addrspace(1)* %b,
+    <4 x half> addrspace(1)* %c) {
+  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
+  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
+  %c.val = load <4 x half>, <4 x half> addrspace(1)* %c
+  %r.val = call <4 x half> @llvm.fma.v4f16(<4 x half> %a.val, <4 x half> %b.val, <4 x half> %c.val)
+  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
+  ret void
+}