[AMDGPU] gfx11 Select on Buffer Atomic FAdd Rtn type
Reviewed By: #amdgpu, foad, rampitec

Differential Revision: https://reviews.llvm.org/D128205
commit ae72fee74e
parent 94ed2caf70
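For context: as encoded by the new hasAtomicFaddRtnForTy helper below, the returning buffer atomic fadd is available for f32 wherever hasAtomicFaddRtnInsts() holds (which now includes gfx11), while the v2f16/f64 returning forms still require gfx90a. The rtn/no-rtn decision is therefore keyed on the result type instead of a blanket gfx90a check. A minimal standalone IR sketch of the returning f32 case follows; it is not taken from this commit, assumes only the raw-buffer fadd intrinsic signature declared in the new test at the end, and the function name is illustrative.

; Sketch only: a returning f32 raw-buffer atomic fadd of the kind gfx11 can now
; select through the normal pattern path instead of hitting the
; "return versions of fp atomics not supported" diagnostic.
define amdgpu_ps float @fadd_f32_rtn_sketch(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret float %ret
}

declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)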
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

@@ -7,8 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUGlobalISelUtils.h"
+#include "GCNSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
 
 using namespace llvm;
 using namespace MIPatternMatch;
@@ -66,3 +68,12 @@ bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
     return true;
   return (Mask[0] & 2) == (Mask[1] & 2);
 }
+
+bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget,
+                                   const LLT &Ty) {
+  if (Ty == LLT::scalar(32))
+    return Subtarget.hasAtomicFaddRtnInsts();
+  if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64))
+    return Subtarget.hasGFX90AInsts();
+  return false;
+}
llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

@@ -16,6 +16,8 @@
 namespace llvm {
 
 class MachineRegisterInfo;
+class GCNSubtarget;
+class LLT;
 
 namespace AMDGPU {
 
@@ -24,7 +26,7 @@ std::pair<Register, unsigned>
 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
 
 bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
 
+bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
 }
 }
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

@@ -2994,13 +2994,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
 
 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
     MachineInstr &MI) const {
-  if (STI.hasGFX90AInsts())
+  const Register DefReg = MI.getOperand(0).getReg();
+  LLT DefTy = MRI->getType(DefReg);
+  if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
     return selectImpl(MI, *CoverageInfo);
 
   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+  if (!MRI->use_nodbg_empty(DefReg)) {
     Function &F = MBB->getParent()->getFunction();
     DiagnosticInfoUnsupported
         NoFpRet(F, "return versions of fp atomics not supported",
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

@@ -5738,7 +5738,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
     Register DstReg = MI.getOperand(0).getReg();
-    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+    if (!MRI.use_empty(DstReg) &&
+        !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
       Function &F = B.getMF().getFunction();
       DiagnosticInfoUnsupported NoFpRet(
           F, "return versions of fp atomics not supported", B.getDebugLoc(),
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -4362,6 +4362,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
   return isTypeLegal(VT.getScalarType());
 }
 
+bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
+  switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
+  case MVT::f32:
+    return Subtarget->hasAtomicFaddRtnInsts();
+  case MVT::v2f16:
+  case MVT::f64:
+    return Subtarget->hasGFX90AInsts();
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   // This currently forces unfolding various combinations of fsub into fma with
   // free fneg'd operands. As long as we have fast FMA (controlled by
@@ -7399,7 +7411,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
     break;
   case Intrinsic::amdgcn_buffer_atomic_fadd:
-    if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+    if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
      DiagnosticInfoUnsupported
          NoFpRet(DAG.getMachineFunction().getFunction(),
                  "return versions of fp atomics not supported",
@@ -12623,7 +12635,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
      return AtomicExpansionKind::CmpXChg;
 
    if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
-       Subtarget->hasAtomicFaddInsts()) {
+       Subtarget->hasAtomicFaddNoRtnInsts()) {
      if (Subtarget->hasGFX940Insts())
        return AtomicExpansionKind::None;
 
llvm/lib/Target/AMDGPU/SIISelLowering.h

@@ -394,6 +394,7 @@ public:
                               MachineBasicBlock *BB) const override;
 
   bool hasBitPreservingFPLogic(EVT VT) const override;
+  bool hasAtomicFaddRtnForTy(SDValue &Op) const;
   bool enableAggressiveFMAFusion(EVT VT) const override;
   bool enableAggressiveFMAFusion(LLT Ty) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
@@ -0,0 +1,99 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+; no-rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+  ret void
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps void @xstruct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+  ret void
+}
+
+
+; rtn
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFEN
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_OFFSET
+define amdgpu_ps float @raw_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_IDXEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @struct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %voffset.add = add i32 %voffset, 4095
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
+  ret float %ret
+}
+
+; GFX11: BUFFER_ATOMIC_ADD_F32_BOTHEN
+define amdgpu_ps float @xstruct_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
+  ret float %ret
+}
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
+declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
+attributes #0 = { nounwind }