forked from OSchip/llvm-project
[AMDGPU] Add intrinsics for 16 bit interpolation
Summary: Added the intrinsics llvm.amdgcn.interp.p1.f16() and llvm.amdgcn.interp.p2.f16() and related LIT test. The p1 intrinsic generates code appropriate for both 16 and 32 bank LDS. Reviewers: #amdgpu, dstuttard, arsenm, tpr Reviewed By: #amdgpu, arsenm Subscribers: jvesely, mgorny, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D46754 llvm-svn: 352357
This commit is contained in:
parent
575c0855c0
commit
824ca3f3dd
|
@ -1165,6 +1165,20 @@ def int_amdgcn_interp_p2 :
|
|||
[IntrNoMem, IntrSpeculatable]>;
|
||||
// See int_amdgcn_interp_p1 for why this is IntrNoMem.
|
||||
|
||||
// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
|
||||
def int_amdgcn_interp_p1_f16 :
|
||||
GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
|
||||
Intrinsic<[llvm_float_ty],
|
||||
[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]>;
|
||||
|
||||
// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
|
||||
def int_amdgcn_interp_p2_f16 :
|
||||
GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
|
||||
Intrinsic<[llvm_half_ty],
|
||||
[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]>;
|
||||
|
||||
// Pixel shaders only: whether the current pixel is live (i.e. not a helper
|
||||
// invocation for derivative computation).
|
||||
def int_amdgcn_ps_live : Intrinsic <
|
||||
|
|
|
@ -4183,6 +4183,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
NODE_NAME_CASE(INTERP_MOV)
|
||||
NODE_NAME_CASE(INTERP_P1)
|
||||
NODE_NAME_CASE(INTERP_P2)
|
||||
NODE_NAME_CASE(INTERP_P1LL_F16)
|
||||
NODE_NAME_CASE(INTERP_P1LV_F16)
|
||||
NODE_NAME_CASE(INTERP_P2_F16)
|
||||
NODE_NAME_CASE(STORE_MSKOR)
|
||||
NODE_NAME_CASE(LOAD_CONSTANT)
|
||||
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
|
||||
|
|
|
@ -462,6 +462,9 @@ enum NodeType : unsigned {
|
|||
INTERP_MOV,
|
||||
INTERP_P1,
|
||||
INTERP_P2,
|
||||
INTERP_P1LL_F16,
|
||||
INTERP_P1LV_F16,
|
||||
INTERP_P2_F16,
|
||||
PC_ADD_REL_OFFSET,
|
||||
KILL,
|
||||
DUMMY_CHAIN,
|
||||
|
|
|
@ -369,6 +369,17 @@ def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
|
|||
SDTypeProfile<1, 4, [SDTCisFP<0>]>,
|
||||
[SDNPInGlue]>;
|
||||
|
||||
def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
|
||||
SDTypeProfile<1, 7, [SDTCisFP<0>]>,
|
||||
[SDNPInGlue, SDNPOutGlue]>;
|
||||
|
||||
def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
|
||||
SDTypeProfile<1, 9, [SDTCisFP<0>]>,
|
||||
[SDNPInGlue, SDNPOutGlue]>;
|
||||
|
||||
def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
|
||||
SDTypeProfile<1, 8, [SDTCisFP<0>]>,
|
||||
[SDNPInGlue]>;
|
||||
|
||||
def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
|
||||
[SDNPHasChain, SDNPSideEffect]>;
|
||||
|
|
|
@ -48,6 +48,8 @@ def : SourceOfDivergence<int_amdgcn_workitem_id_z>;
|
|||
def : SourceOfDivergence<int_amdgcn_interp_mov>;
|
||||
def : SourceOfDivergence<int_amdgcn_interp_p1>;
|
||||
def : SourceOfDivergence<int_amdgcn_interp_p2>;
|
||||
def : SourceOfDivergence<int_amdgcn_interp_p1_f16>;
|
||||
def : SourceOfDivergence<int_amdgcn_interp_p2_f16>;
|
||||
def : SourceOfDivergence<int_amdgcn_mbcnt_hi>;
|
||||
def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
|
||||
def : SourceOfDivergence<int_r600_read_tidig_x>;
|
||||
|
|
|
@ -5292,6 +5292,59 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|||
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
|
||||
Glue);
|
||||
}
|
||||
case Intrinsic::amdgcn_interp_p1_f16: {
|
||||
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
|
||||
SDValue Glue = M0.getValue(1);
|
||||
if (getSubtarget()->getLDSBankCount() == 16) {
|
||||
// 16 bank LDS
|
||||
SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
|
||||
DAG.getConstant(2, DL, MVT::i32), // P0
|
||||
Op.getOperand(2), // Attrchan
|
||||
Op.getOperand(3), // Attr
|
||||
Glue);
|
||||
SDValue Ops[] = {
|
||||
Op.getOperand(1), // Src0
|
||||
Op.getOperand(2), // Attrchan
|
||||
Op.getOperand(3), // Attr
|
||||
DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
|
||||
S, // Src2 - holds two f16 values selected by high
|
||||
DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
|
||||
Op.getOperand(4), // high
|
||||
DAG.getConstant(0, DL, MVT::i1), // $clamp
|
||||
DAG.getConstant(0, DL, MVT::i32) // $omod
|
||||
};
|
||||
return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
|
||||
} else {
|
||||
// 32 bank LDS
|
||||
SDValue Ops[] = {
|
||||
Op.getOperand(1), // Src0
|
||||
Op.getOperand(2), // Attrchan
|
||||
Op.getOperand(3), // Attr
|
||||
DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
|
||||
Op.getOperand(4), // high
|
||||
DAG.getConstant(0, DL, MVT::i1), // $clamp
|
||||
DAG.getConstant(0, DL, MVT::i32), // $omod
|
||||
Glue
|
||||
};
|
||||
return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
|
||||
}
|
||||
}
|
||||
case Intrinsic::amdgcn_interp_p2_f16: {
|
||||
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
|
||||
SDValue Glue = SDValue(M0.getNode(), 1);
|
||||
SDValue Ops[] = {
|
||||
Op.getOperand(2), // Src0
|
||||
Op.getOperand(3), // Attrchan
|
||||
Op.getOperand(4), // Attr
|
||||
DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
|
||||
Op.getOperand(1), // Src2
|
||||
DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
|
||||
Op.getOperand(5), // high
|
||||
DAG.getConstant(0, DL, MVT::i1), // $clamp
|
||||
Glue
|
||||
};
|
||||
return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
|
||||
}
|
||||
case Intrinsic::amdgcn_sin:
|
||||
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
|
||||
|
||||
|
|
|
@ -462,7 +462,14 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
|
|||
let FPDPRounding = 1 in {
|
||||
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
|
||||
let Uses = [M0, EXEC] in {
|
||||
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
|
||||
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
|
||||
[(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
|
||||
(i32 imm:$attr),
|
||||
(i32 imm:$src0_modifiers),
|
||||
(f32 VRegSrc_32:$src2),
|
||||
(i32 imm:$src2_modifiers),
|
||||
(i1 imm:$high),
|
||||
(i1 imm:$clamp)))]>;
|
||||
} // End Uses = [M0, EXEC]
|
||||
} // End FPDPRounding = 1
|
||||
} // End renamedInGFX9 = 1
|
||||
|
@ -477,8 +484,22 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
|
|||
} // End SubtargetPredicate = isGFX9
|
||||
|
||||
let Uses = [M0, EXEC], FPDPRounding = 1 in {
|
||||
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
|
||||
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
|
||||
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
|
||||
[(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
|
||||
(i32 imm:$attr),
|
||||
(i32 imm:$src0_modifiers),
|
||||
(i1 imm:$high),
|
||||
(i1 imm:$clamp),
|
||||
(i32 imm:$omod)))]>;
|
||||
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
|
||||
[(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
|
||||
(i32 imm:$attr),
|
||||
(i32 imm:$src0_modifiers),
|
||||
(f32 VRegSrc_32:$src2),
|
||||
(i32 imm:$src2_modifiers),
|
||||
(i1 imm:$high),
|
||||
(i1 imm:$clamp),
|
||||
(i32 imm:$omod)))]>;
|
||||
} // End Uses = [M0, EXEC], FPDPRounding = 1
|
||||
|
||||
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
|
||||
|
||||
; CHECK: for function 'interp_p1_f16'
|
||||
; CHECK: DIVERGENT: %p1 = call float @llvm.amdgcn.interp.p1.f16
|
||||
define amdgpu_ps float @interp_p1_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
|
||||
main_body:
|
||||
%p1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
|
||||
ret float %p1
|
||||
}
|
||||
|
||||
; CHECK: for function 'interp_p2_f16'
|
||||
; CHECK: DIVERGENT: %p2 = call half @llvm.amdgcn.interp.p2.f16
|
||||
define amdgpu_ps half @interp_p2_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
|
||||
main_body:
|
||||
%p2 = call half @llvm.amdgcn.interp.p2.f16(float %i, float %j, i32 1, i32 2, i1 0, i32 %m0)
|
||||
ret half %p2
|
||||
}
|
||||
|
||||
; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
|
||||
declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
|
||||
; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
|
||||
declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
|
||||
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
|
@ -0,0 +1,187 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-32BANK %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-32BANK %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-16BANK %s
|
||||
|
||||
define amdgpu_ps half @interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
|
||||
; GFX9-32BANK-LABEL: interp_f16:
|
||||
; GFX9-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v1, v0, attr2.y
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y high
|
||||
; GFX9-32BANK-NEXT: v_interp_p2_legacy_f16 v1, v2, attr2.y, v1
|
||||
; GFX9-32BANK-NEXT: v_interp_p2_legacy_f16 v0, v2, attr2.y, v0 high
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX9-32BANK-NEXT: v_add_f16_e32 v0, v1, v0
|
||||
; GFX9-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-32BANK-LABEL: interp_f16:
|
||||
; GFX8-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v1, v0, attr2.y
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y high
|
||||
; GFX8-32BANK-NEXT: v_interp_p2_f16 v1, v2, attr2.y, v1
|
||||
; GFX8-32BANK-NEXT: v_interp_p2_f16 v0, v2, attr2.y, v0 high
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-32BANK-NEXT: v_add_f16_e32 v0, v1, v0
|
||||
; GFX8-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-16BANK-LABEL: interp_f16:
|
||||
; GFX8-16BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v0, p0, attr2.y
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v2, v1, attr2.y, v0
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v1, attr2.y, v0 high
|
||||
; GFX8-16BANK-NEXT: v_interp_p2_f16 v2, v3, attr2.y, v2
|
||||
; GFX8-16BANK-NEXT: v_interp_p2_f16 v0, v3, attr2.y, v0 high
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-16BANK-NEXT: v_add_f16_e32 v0, v2, v0
|
||||
; GFX8-16BANK-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
%p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%p1_1 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 1, i32 %m0)
|
||||
%p2_1 = call half @llvm.amdgcn.interp.p2.f16(float %p1_1, float %j, i32 1, i32 2, i1 1, i32 %m0)
|
||||
%res = fadd half %p2_0, %p2_1
|
||||
ret half %res
|
||||
}
|
||||
|
||||
; check that m0 is setup correctly before the interp p1 instruction
|
||||
define amdgpu_ps half @interp_p1_m0_setup(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
|
||||
; GFX9-32BANK-LABEL: interp_p1_m0_setup:
|
||||
; GFX9-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX9-32BANK-NEXT: ;;#ASMSTART
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX9-32BANK-NEXT: ;;#ASMEND
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 s3, m0
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-32BANK-NEXT: v_interp_p2_legacy_f16 v0, v1, attr2.y, v0
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX9-32BANK-NEXT: v_add_f16_e32 v0, s3, v0
|
||||
; GFX9-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-32BANK-LABEL: interp_p1_m0_setup:
|
||||
; GFX8-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-32BANK-NEXT: ;;#ASMSTART
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX8-32BANK-NEXT: ;;#ASMEND
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 s3, m0
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-32BANK-NEXT: v_interp_p2_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-32BANK-NEXT: v_add_f16_e32 v0, s3, v0
|
||||
; GFX8-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-16BANK-LABEL: interp_p1_m0_setup:
|
||||
; GFX8-16BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-16BANK-NEXT: ;;#ASMSTART
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX8-16BANK-NEXT: ;;#ASMEND
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 s3, m0
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v0, p0, attr2.y
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-16BANK-NEXT: v_interp_p2_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-16BANK-NEXT: v_add_f16_e32 v0, s3, v0
|
||||
; GFX8-16BANK-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
%mx = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
|
||||
%p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%my = trunc i32 %mx to i16
|
||||
%mh = bitcast i16 %my to half
|
||||
%res = fadd half %p2_0, %mh
|
||||
ret half %res
|
||||
}
|
||||
|
||||
; check that m0 is setup correctly before the interp p2 instruction
|
||||
define amdgpu_ps half @interp_p2_m0_setup(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
|
||||
; GFX9-32BANK-LABEL: interp_p2_m0_setup:
|
||||
; GFX9-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
|
||||
; GFX9-32BANK-NEXT: ;;#ASMSTART
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX9-32BANK-NEXT: ;;#ASMEND
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 s0, m0
|
||||
; GFX9-32BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9-32BANK-NEXT: v_interp_p2_legacy_f16 v0, v1, attr2.y, v0
|
||||
; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX9-32BANK-NEXT: v_add_f16_e32 v0, s0, v0
|
||||
; GFX9-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-32BANK-LABEL: interp_p2_m0_setup:
|
||||
; GFX8-32BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
|
||||
; GFX8-32BANK-NEXT: ;;#ASMSTART
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX8-32BANK-NEXT: ;;#ASMEND
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 s0, m0
|
||||
; GFX8-32BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-32BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-32BANK-NEXT: v_interp_p2_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-32BANK-NEXT: v_add_f16_e32 v0, s0, v0
|
||||
; GFX8-32BANK-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-16BANK-LABEL: interp_p2_m0_setup:
|
||||
; GFX8-16BANK: ; %bb.0: ; %main_body
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v0, p0, attr2.y
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
|
||||
; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-16BANK-NEXT: ;;#ASMSTART
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, 0
|
||||
; GFX8-16BANK-NEXT: ;;#ASMEND
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 s0, m0
|
||||
; GFX8-16BANK-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-16BANK-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8-16BANK-NEXT: v_interp_p2_f16 v0, v1, attr2.y, v0
|
||||
; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
|
||||
; GFX8-16BANK-NEXT: v_add_f16_e32 v0, s0, v0
|
||||
; GFX8-16BANK-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
%p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%mx = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
|
||||
%p2_0 = call half @llvm.amdgcn.interp.p2.f16(float %p1_0, float %j, i32 1, i32 2, i1 0, i32 %m0)
|
||||
%my = trunc i32 %mx to i16
|
||||
%mh = bitcast i16 %my to half
|
||||
%res = fadd half %p2_0, %mh
|
||||
ret half %res
|
||||
}
|
||||
|
||||
; float @llvm.amdgcn.interp.p1.f16(i, attrchan, attr, high, m0)
|
||||
declare float @llvm.amdgcn.interp.p1.f16(float, i32, i32, i1, i32) #0
|
||||
; half @llvm.amdgcn.interp.p2.f16(p1, j, attrchan, attr, high, m0)
|
||||
declare half @llvm.amdgcn.interp.p2.f16(float, float, i32, i32, i1, i32) #0
|
||||
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
Loading…
Reference in New Issue