[AMDGPU] Enable v4f16 and above for v_pk_fma instructions
Summary:
If isel is presented with <2 x half> vectors then it will correctly select
v_pk_fma style instructions. If isel is presented with e.g. <4 x half>
vectors it will scalarize, unlike for other instruction types (such as
fadd, fmul etc.). Added extra support to enable this. Updated one of the
tests to include a test for this (as well as extending the test to GFX9).

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65325

Change-Id: I50a4577a3f8223fb53992af3b7d26121f65b71ee

llvm-svn: 367206
parent 8538060103, commit 20235ef3e7
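
To make the effect concrete, here is a minimal LLVM IR sketch — not taken
from the patch itself; the function name @fma_v4f16_example is illustrative.
Before this change isel scalarized a <4 x half> fma into four v_fma_f16
operations; with splitTernaryVectorOp the node is split into two <2 x half>
halves, each of which matches v_pk_fma_f16 on GFX9:

; Illustrative only: compile with something like
;   llc -march=amdgcn -mcpu=gfx900 < example.ll
declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)

define <4 x half> @fma_v4f16_example(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
  ; Expected on gfx900 after this patch: two v_pk_fma_f16 instructions,
  ; one per <2 x half> half, instead of four scalar v_fma_f16.
  %r = call <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %r
}
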
@@ -653,6 +653,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMA, MVT::v4f16, Custom);
 
     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
     setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
@@ -3971,6 +3972,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
+
+SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+  SDValue Lo0, Hi0;
+  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  SDValue Lo1, Hi1;
+  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+  SDValue Lo2, Hi2;
+  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
+
+  SDLoc SL(Op);
+
+  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+                             Op->getFlags());
+  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+                             Op->getFlags());
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
 
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -4023,6 +4048,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMA:
+    return splitTernaryVectorOp(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -331,6 +331,7 @@ public:
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -3,74 +3,96 @@
 ; GCN-LABEL: {{^}}addMul2D:
 ; GFX1010: v_fmac_f16
 ; GFX1010: v_fmac_f16
-define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
-  %5 = extractelement <2 x i32> %2, i64 1
-  %6 = icmp sgt i32 %5, 0
-  br i1 %6, label %7, label %38
+define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly %arg, float addrspace(4)* nocapture readonly %arg1, <2 x i32> %arg2, i32 %arg3) local_unnamed_addr #0 {
+bb:
+  %tmp = extractelement <2 x i32> %arg2, i64 1
+  %tmp4 = icmp sgt i32 %tmp, 0
+  br i1 %tmp4, label %bb5, label %bb36
 
-7:                                                ; preds = %4
-  %8 = extractelement <2 x i32> %2, i64 0
-  %9 = icmp sgt i32 %8, 0
-  br label %10
+bb5:                                              ; preds = %bb
+  %tmp6 = extractelement <2 x i32> %arg2, i64 0
+  %tmp7 = icmp sgt i32 %tmp6, 0
+  br label %bb8
 
-10:                                               ; preds = %34, %7
-  %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
-  %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
-  br i1 %9, label %13, label %34
+bb8:                                              ; preds = %bb32, %bb5
+  %tmp9 = phi <4 x half> [ zeroinitializer, %bb5 ], [ %tmp33, %bb32 ]
+  %tmp10 = phi i32 [ 0, %bb5 ], [ %tmp34, %bb32 ]
+  br i1 %tmp7, label %bb11, label %bb32
 
-13:                                               ; preds = %10
-  %14 = mul nsw i32 %12, %3
-  %15 = mul nsw i32 %12, %8
-  br label %16
+bb11:                                             ; preds = %bb8
+  %tmp12 = mul nsw i32 %tmp10, %arg3
+  %tmp13 = mul nsw i32 %tmp10, %tmp6
+  br label %bb14
 
-16:                                               ; preds = %16, %13
-  %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
-  %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
-  %19 = add nsw i32 %18, %14
-  %20 = sext i32 %19 to i64
-  %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
-  %22 = load <4 x i8>, <4 x i8>* %21, align 4
-  %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
-  %24 = add nsw i32 %18, %15
-  %25 = sext i32 %24 to i64
-  %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
-  %27 = load float, float addrspace(4)* %26, align 4
-  %28 = fptrunc float %27 to half
-  %29 = insertelement <4 x half> undef, half %28, i32 0
-  %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
-  %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
-  %32 = add nuw nsw i32 %18, 1
-  %33 = icmp eq i32 %32, %8
-  br i1 %33, label %34, label %16
+bb14:                                             ; preds = %bb14, %bb11
+  %tmp15 = phi <4 x half> [ %tmp9, %bb11 ], [ %tmp29, %bb14 ]
+  %tmp16 = phi i32 [ 0, %bb11 ], [ %tmp30, %bb14 ]
+  %tmp17 = add nsw i32 %tmp16, %tmp12
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = getelementptr inbounds <4 x i8>, <4 x i8>* %arg, i64 %tmp18
+  %tmp20 = load <4 x i8>, <4 x i8>* %tmp19, align 4
+  %tmp21 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %tmp20)
+  %tmp22 = add nsw i32 %tmp16, %tmp13
+  %tmp23 = sext i32 %tmp22 to i64
+  %tmp24 = getelementptr inbounds float, float addrspace(4)* %arg1, i64 %tmp23
+  %tmp25 = load float, float addrspace(4)* %tmp24, align 4
+  %tmp26 = fptrunc float %tmp25 to half
+  %tmp27 = insertelement <4 x half> undef, half %tmp26, i32 0
+  %tmp28 = shufflevector <4 x half> %tmp27, <4 x half> undef, <4 x i32> zeroinitializer
+  %vec.A.0 = extractelement <4 x half> %tmp21, i32 0
+  %vec.B.0 = extractelement <4 x half> %tmp28, i32 0
+  %vec.C.0 = extractelement <4 x half> %tmp15, i32 0
+  %vec.res.0 = tail call half @llvm.fmuladd.f16(half %vec.A.0, half %vec.B.0, half %vec.C.0)
+  %vec.A.1 = extractelement <4 x half> %tmp21, i32 1
+  %vec.B.1 = extractelement <4 x half> %tmp28, i32 1
+  %vec.C.1 = extractelement <4 x half> %tmp15, i32 1
+  %vec.res.1 = tail call half @llvm.fmuladd.f16(half %vec.A.1, half %vec.B.1, half %vec.C.1)
+  %vec.A.2 = extractelement <4 x half> %tmp21, i32 2
+  %vec.B.2 = extractelement <4 x half> %tmp28, i32 2
+  %vec.C.2 = extractelement <4 x half> %tmp15, i32 2
+  %vec.res.2 = tail call half @llvm.fmuladd.f16(half %vec.A.2, half %vec.B.2, half %vec.C.2)
+  %vec.A.3 = extractelement <4 x half> %tmp21, i32 3
+  %vec.B.3 = extractelement <4 x half> %tmp28, i32 3
+  %vec.C.3 = extractelement <4 x half> %tmp15, i32 3
+  %vec.res.3 = tail call half @llvm.fmuladd.f16(half %vec.A.3, half %vec.B.3, half %vec.C.3)
+  %full.res.0 = insertelement <4 x half> undef, half %vec.res.0, i32 0
+  %full.res.1 = insertelement <4 x half> %full.res.0, half %vec.res.1, i32 1
+  %full.res.2 = insertelement <4 x half> %full.res.1, half %vec.res.2, i32 2
+  %tmp29 = insertelement <4 x half> %full.res.2, half %vec.res.3, i32 3
+  %tmp30 = add nuw nsw i32 %tmp16, 1
+  %tmp31 = icmp eq i32 %tmp30, %tmp6
+  br i1 %tmp31, label %bb32, label %bb14
 
-34:                                               ; preds = %16, %10
-  %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
-  %36 = add nuw nsw i32 %12, 1
-  %37 = icmp eq i32 %36, %5
-  br i1 %37, label %38, label %10
+bb32:                                             ; preds = %bb14, %bb8
+  %tmp33 = phi <4 x half> [ %tmp9, %bb8 ], [ %tmp29, %bb14 ]
+  %tmp34 = add nuw nsw i32 %tmp10, 1
+  %tmp35 = icmp eq i32 %tmp34, %tmp
+  br i1 %tmp35, label %bb36, label %bb8
 
-38:                                               ; preds = %34, %4
-  %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
-  ret <4 x half> %39
+bb36:                                             ; preds = %bb32, %bb
+  %tmp37 = phi <4 x half> [ zeroinitializer, %bb ], [ %tmp33, %bb32 ]
+  ret <4 x half> %tmp37
 }
 
-define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
-  %2 = extractelement <4 x i8> %0, i64 0
-  %3 = uitofp i8 %2 to half
-  %4 = insertelement <4 x half> undef, half %3, i32 0
-  %5 = extractelement <4 x i8> %0, i64 1
-  %6 = uitofp i8 %5 to half
-  %7 = insertelement <4 x half> %4, half %6, i32 1
-  %8 = extractelement <4 x i8> %0, i64 2
-  %9 = uitofp i8 %8 to half
-  %10 = insertelement <4 x half> %7, half %9, i32 2
-  %11 = extractelement <4 x i8> %0, i64 3
-  %12 = uitofp i8 %11 to half
-  %13 = insertelement <4 x half> %10, half %12, i32 3
-  ret <4 x half> %13
+; Function Attrs: norecurse nounwind readnone
+define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %arg) local_unnamed_addr #1 {
+bb:
+  %tmp = extractelement <4 x i8> %arg, i64 0
+  %tmp1 = uitofp i8 %tmp to half
+  %tmp2 = insertelement <4 x half> undef, half %tmp1, i32 0
+  %tmp3 = extractelement <4 x i8> %arg, i64 1
+  %tmp4 = uitofp i8 %tmp3 to half
+  %tmp5 = insertelement <4 x half> %tmp2, half %tmp4, i32 1
+  %tmp6 = extractelement <4 x i8> %arg, i64 2
+  %tmp7 = uitofp i8 %tmp6 to half
+  %tmp8 = insertelement <4 x half> %tmp5, half %tmp7, i32 2
+  %tmp9 = extractelement <4 x i8> %arg, i64 3
+  %tmp10 = uitofp i8 %tmp9 to half
+  %tmp11 = insertelement <4 x half> %tmp8, half %tmp10, i32 3
+  ret <4 x half> %tmp11
 }
 
 declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare half @llvm.fmuladd.f16(half, half, half)
 
 attributes #0 = { convergent nounwind readonly}
 attributes #1 = { norecurse nounwind readnone }
@@ -1,8 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
 
 declare half @llvm.fma.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 ; GCN-LABEL: {{^}}fma_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -13,7 +15,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16(
@@ -38,8 +40,8 @@ define amdgpu_kernel void @fma_f16(
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_a(
@@ -61,8 +63,8 @@ define amdgpu_kernel void @fma_f16_imm_a(
 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_b(
@@ -84,8 +86,8 @@ define amdgpu_kernel void @fma_f16_imm_b(
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_c(
@@ -127,9 +129,11 @@ define amdgpu_kernel void @fma_f16_imm_c(
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16(
@@ -150,14 +154,14 @@ define amdgpu_kernel void @fma_v2f16(
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 
 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -172,9 +176,11 @@ define amdgpu_kernel void @fma_v2f16(
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_a(
@@ -192,11 +198,11 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
 
 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -215,9 +221,11 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
 ; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
 ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_v2f16_imm_b(
@@ -235,11 +243,11 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
 
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -265,6 +273,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
 ; GCN-NOT: and
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
@@ -278,3 +287,74 @@ define amdgpu_kernel void @fma_v2f16_imm_c(
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fma_v4f16
+; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]
+
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
+; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]
+
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]
+
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
+; GCN: s_endpgm
+
+define amdgpu_kernel void @fma_v4f16(
+    <4 x half> addrspace(1)* %r,
+    <4 x half> addrspace(1)* %a,
+    <4 x half> addrspace(1)* %b,
+    <4 x half> addrspace(1)* %c) {
+  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
+  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
+  %c.val = load <4 x half>, <4 x half> addrspace(1)* %c
+  %r.val = call <4 x half> @llvm.fma.v4f16(<4 x half> %a.val, <4 x half> %b.val, <4 x half> %c.val)
+  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
+  ret void
+}