diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d5f793bc1233..416a441d9267 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -635,6 +635,51 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
+  if (STI.getLDSBankCount() != 16) // Not the 16-bank case: use selectImpl.
+    return selectImpl(MI, *CoverageInfo);
+  // 16 LDS banks: expand by hand into V_INTERP_MOV_F32 + V_INTERP_P1LV_F16.
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+  Register M0Val = MI.getOperand(6).getReg(); // Value copied into m0 below.
+  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
+      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
+      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
+    return false; // One of the registers could not be constrained.
+
+  // This requires 2 instructions. It is possible to write a pattern to support
+  // this, but the generated isel emitter doesn't correctly deal with multiple
+  // output instructions using the same physical register input. The copy to m0
+  // is incorrectly placed before the second instruction.
+  //
+  // TODO: Match source modifiers.
+
+  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+    .addReg(M0Val); // Emitted once, ahead of both interp instructions.
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
+    .addImm(2)                          // Shows up as "p0" in the test's asm.
+    .addImm(MI.getOperand(4).getImm())  // $attr
+    .addImm(MI.getOperand(3).getImm()); // $attrchan
+
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
+    .addImm(0)                          // $src0_modifiers
+    .addReg(Src0)                       // $src0
+    .addImm(MI.getOperand(4).getImm())  // $attr
+    .addImm(MI.getOperand(3).getImm())  // $attrchan
+    .addImm(0)                          // $src2_modifiers
+    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
+    .addImm(MI.getOperand(5).getImm())  // $high
+    .addImm(0)                          // $clamp
+    .addImm(0);                         // $omod
+
+  MI.eraseFromParent(); // The original instruction is now fully replaced.
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   unsigned IntrinsicID = I.getIntrinsicID();
   switch (IntrinsicID) {
@@ -659,6 +704,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
 
     return true;
   }
+  case Intrinsic::amdgcn_interp_p1_f16:
+    return selectInterpP1F16(I); // Custom path; may itself call selectImpl.
   default:
     return selectImpl(I, *CoverageInfo);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c41d3e28bea4..923f8bdba15c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -94,6 +94,8 @@ private:
   bool selectG_PTR_ADD(MachineInstr &I) const;
   bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
   bool selectG_INSERT(MachineInstr &I) const;
+  // Hand-written selection for the amdgcn_interp_p1_f16 intrinsic.
+  bool selectInterpP1F16(MachineInstr &MI) const;
   bool selectG_INTRINSIC(MachineInstr &I) const;
 
   std::tuple
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll
new file mode 100644
index 000000000000..92a0dd574874
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.p1.f16.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-32BANK %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-32BANK %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8-16BANK %s
+
+define amdgpu_ps float @interp_f16(float %i, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_f16:
+; GFX9-32BANK: ; %bb.0:
+; GFX9-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX9-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_f16:
+; GFX8-32BANK: ; %bb.0:
+; GFX8-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX8-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_f16:
+; GFX8-16BANK: ; %bb.0:
+; GFX8-16BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v1, p0, attr2.y
+; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v0, attr2.y, v1
+; GFX8-16BANK-NEXT: ; return to shader part epilog
+  %res = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 false, i32 %m0)
+  ret float %res
+}
+
+define amdgpu_ps float @interp_f16_high(float %i, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_f16_high:
+; GFX9-32BANK: ; %bb.0:
+; GFX9-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y high
+; GFX9-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_f16_high:
+; GFX8-32BANK: ; %bb.0:
+; GFX8-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y high
+; GFX8-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_f16_high:
+; GFX8-16BANK: ; %bb.0:
+; GFX8-16BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v1, p0, attr2.y
+; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v0, attr2.y, v1 high
+; GFX8-16BANK-NEXT: ; return to shader part epilog
+  %res = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 true, i32 %m0)
+  ret float %res
+}
+
+define amdgpu_ps float @interp_f16_0_0(float %i, i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_f16_0_0:
+; GFX9-32BANK: ; %bb.0:
+; GFX9-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr0.x
+; GFX9-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_f16_0_0:
+; GFX8-32BANK: ; %bb.0:
+; GFX8-32BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr0.x
+; GFX8-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_f16_0_0:
+; GFX8-16BANK: ; %bb.0:
+; GFX8-16BANK-NEXT: s_mov_b32 m0, s0
+; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v1, p0, attr0.x
+; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v0, attr0.x, v1
+; GFX8-16BANK-NEXT: ; return to shader part epilog
+  %res = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 0, i32 0, i1 false, i32 %m0)
+  ret float %res
+}
+
+; Copy needed to legalize %i
+define amdgpu_ps float @interp_f16_sgpr_i(float inreg %i,i32 inreg %m0) #0 {
+; GFX9-32BANK-LABEL: interp_f16_sgpr_i:
+; GFX9-32BANK: ; %bb.0:
+; GFX9-32BANK-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-32BANK-NEXT: s_mov_b32 m0, s1
+; GFX9-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX9-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX9-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-32BANK-LABEL: interp_f16_sgpr_i:
+; GFX8-32BANK: ; %bb.0:
+; GFX8-32BANK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-32BANK-NEXT: s_mov_b32 m0, s1
+; GFX8-32BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-32BANK-NEXT: v_interp_p1ll_f16 v0, v0, attr2.y
+; GFX8-32BANK-NEXT: ; return to shader part epilog
+;
+; GFX8-16BANK-LABEL: interp_f16_sgpr_i:
+; GFX8-16BANK: ; %bb.0:
+; GFX8-16BANK-NEXT: s_mov_b32 m0, s1
+; GFX8-16BANK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-16BANK-NEXT: v_interp_mov_f32_e32 v1, p0, attr2.y
+; GFX8-16BANK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
+; GFX8-16BANK-NEXT: v_interp_p1lv_f16 v0, v0, attr2.y, v1
+; GFX8-16BANK-NEXT: ; return to shader part epilog
+  %res = call float @llvm.amdgcn.interp.p1.f16(float %i, i32 1, i32 2, i1 false, i32 %m0)
+  ret float %res
+}
+
+declare float @llvm.amdgcn.interp.p1.f16(float, i32 immarg, i32 immarg, i1 immarg, i32) #0
+
+attributes #0 = { nounwind readnone speculatable }