[AMDGPU] Unify intrinsic ret/nortn interface

We have a single noret intrinsic and a lot of special handling
around it. Instead, declare it just like any other intrinsic and
simply do not define the rtn instructions for it.

Differential Revision: https://reviews.llvm.org/D87719
Stanislav Mekhanoshin 2020-09-10 15:10:52 -07:00
parent 7b4cc0961b
commit 277de43d88
27 changed files with 421 additions and 362 deletions
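
A rough sketch of the unified interface (illustrative only; declarations taken from the tests updated below, the function name is made up for the example): the fadd atomics are now declared like every other buffer/global atomic and return the data value, and a caller that does not need the old value simply ignores the result. Using the result is diagnosed with "return versions of fp atomics not supported" on targets that only have the no-return instructions.

; Old dedicated no-return form
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float)

; New unified form: declared like any other atomic intrinsic
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)

define void @no_rtn_use(float addrspace(1)* %ptr, float %data) {
  ; result intentionally unused, so the no-rtn instruction is selected
  %unused = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
  ret void
}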


@ -1012,7 +1012,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
AMDGPURsrcIntrinsic<2, 0>;
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
!if(NoRtn, [], [data_ty]),
@ -1049,7 +1049,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
AMDGPURsrcIntrinsic<2, 0>;
// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
// Obsolescent tbuffer intrinsics.
@ -1181,6 +1181,19 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
AMDGPURsrcIntrinsic<2, 0>;
def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
class AMDGPUBufferAtomicFP : Intrinsic <
[llvm_anyfloat_ty],
[LLVMMatchType<0>, // vdata(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty], // slc(imm)
[ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1, 0>;
// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
} // defset AMDGPUBufferIntrinsics
// Uses that do not set the done bit should set IntrWriteMem on the
@ -1800,27 +1813,7 @@ def int_amdgcn_udot8 :
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//
class AMDGPUBufferAtomicNoRtn : Intrinsic <
[],
[llvm_anyfloat_ty, // vdata(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(SGPR/VGPR/imm)
llvm_i1_ty], // slc(imm)
[ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1, 0>;
class AMDGPUGlobalAtomicNoRtn : Intrinsic <
[],
[llvm_anyptr_ty, // vaddr
llvm_anyfloat_ty], // vdata(VGPR)
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
[SDNPMemOperand]>;
def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;
// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
// llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,


@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@ -1743,6 +1744,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_global_atomic_fadd:
return selectGlobalAtomicFaddIntrinsic(I);
default: {
return selectImpl(I, *CoverageInfo);
}
@ -2899,6 +2902,123 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
return true;
}
bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
Function &F = MBB->getParent()->getFunction();
DiagnosticInfoUnsupported
NoFpRet(F, "return versions of fp atomics not supported",
MI.getDebugLoc(), DS_Error);
F.getContext().diagnose(NoFpRet);
return false;
}
// FIXME: This is only needed because tablegen requires number of dst operands
// in match and replace pattern to be the same. Otherwise patterns can be
// exported from SDag path.
MachineOperand &VDataIn = MI.getOperand(1);
MachineOperand &VIndex = MI.getOperand(3);
MachineOperand &VOffset = MI.getOperand(4);
MachineOperand &SOffset = MI.getOperand(5);
int16_t Offset = MI.getOperand(6).getImm();
bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
unsigned Opcode;
if (HasVOffset) {
Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
: AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
} else {
Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
: AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
}
if (MRI->getType(VDataIn.getReg()).isVector()) {
switch (Opcode) {
case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
break;
case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
break;
case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
break;
case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
break;
}
}
auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
I.add(VDataIn);
if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
.addReg(VIndex.getReg())
.addImm(AMDGPU::sub0)
.addReg(VOffset.getReg())
.addImm(AMDGPU::sub1);
I.addReg(IdxReg);
} else if (HasVIndex) {
I.add(VIndex);
} else if (HasVOffset) {
I.add(VOffset);
}
I.add(MI.getOperand(2)); // rsrc
I.add(SOffset);
I.addImm(Offset);
renderExtractSLC(I, MI, 7);
I.cloneMemRefs(MI);
MI.eraseFromParent();
return true;
}
bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
Function &F = MBB->getParent()->getFunction();
DiagnosticInfoUnsupported
NoFpRet(F, "return versions of fp atomics not supported",
MI.getDebugLoc(), DS_Error);
F.getContext().diagnose(NoFpRet);
return false;
}
// FIXME: This is only needed because tablegen requires number of dst operands
// in match and replace pattern to be the same. Otherwise patterns can be
// exported from SDag path.
auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
Register Data = MI.getOperand(3).getReg();
const unsigned Opc = MRI->getType(Data).isVector() ?
AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
.addReg(Addr.first)
.addReg(Data)
.addImm(Addr.second)
.addImm(0) // SLC
.cloneMemRefs(MI);
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@ -3018,6 +3138,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
assert(Intr && "not an image intrinsic with image pseudo");
return selectImageIntrinsic(I, Intr);
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
default:
return selectImpl(I, *CoverageInfo);
}
@ -3260,14 +3382,11 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
}
template <bool Signed>
InstructionSelector::ComplexRendererFns
std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
InstructionSelector::ComplexRendererFns Default = {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
}};
auto Default = std::make_pair(Root.getReg(), 0);
if (!STI.hasFlatInstOffsets())
return Default;
@ -3287,20 +3406,27 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
Register BasePtr = OpDef->getOperand(1).getReg();
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
}};
return std::make_pair(BasePtr, Offset.getValue());
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
return selectFlatOffsetImpl<false>(Root);
auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
return selectFlatOffsetImpl<true>(Root);
auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
}};
}
/// Match a zero extend from a 32-bit value to 64-bits.


@ -141,6 +141,8 @@ private:
bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;
std::pair<Register, unsigned>
selectVOP3ModsImpl(MachineOperand &Root) const;
@ -180,11 +182,11 @@ private:
selectSmrdSgpr(MachineOperand &Root) const;
template <bool Signed>
InstructionSelector::ComplexRendererFns
std::pair<Register, int>
selectFlatOffsetImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectFlatOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectFlatOffsetSigned(MachineOperand &Root) const;


@ -483,6 +483,8 @@ defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
let MemoryVT = v2f16 in
defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,


@ -750,6 +750,9 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
for (MachineInstr &MI : Range) {
for (MachineOperand &Def : MI.defs()) {
if (MRI.use_nodbg_empty(Def.getReg()))
continue;
LLT ResTy = MRI.getType(Def.getReg());
const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
ResultRegs.push_back(Def.getReg());
@ -2971,7 +2974,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {1, 4});
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
@ -3929,7 +3932,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
// vdata_out
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@ -3952,23 +3956,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// initialized.
break;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
// vdata_in
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
// rsrc
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
// vindex
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
// voffset
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
// soffset
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
// vdata_out
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);


@ -1094,14 +1094,12 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
int_amdgcn_buffer_wbinvl1>;
let SubtargetPredicate = HasAtomicFaddInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
"buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
//===----------------------------------------------------------------------===//
@ -1394,36 +1392,46 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7),
(vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)),
[{ return SDValue(N, 0).use_empty(); }]> {
let GISelPredicateCode = [{
return MRI.use_nodbg_empty(MI.getOperand(0).getReg());
}];
}
multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_slc $cachepolicy))
(as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_slc $cachepolicy))
(as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_slc $cachepolicy))
(as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),


@ -78,6 +78,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let OtherPredicates = ps.OtherPredicates;
let TSFlags = ps.TSFlags;
let UseNamedOperandTable = ps.UseNamedOperandTable;
@ -714,16 +715,16 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
let is_flat_global = 1 in {
let OtherPredicates = [HasAtomicFaddInsts] in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", VGPR_32, f32
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [HasAtomicFaddInsts]
} // End is_flat_global = 1
//===----------------------------------------------------------------------===//
// Flat Patterns
@ -1081,8 +1082,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_fadd_global_noret, v2f16>;
let OtherPredicates = [HasAtomicFaddInsts] in {
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
}
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10


@ -1121,7 +1121,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_VOID;
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
@ -1135,18 +1135,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
->getPointerElementType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
// FIXME: Should report an atomic ordering here.
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@ -1171,6 +1159,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_global_atomic_fadd: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@ -7034,7 +7033,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_buffer_atomic_xor: {
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
@ -7094,6 +7094,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
if (!Op.getValue(0).use_empty()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
DL.getDebugLoc(), DS_Error);
DAG.getContext()->diagnose(NoFpRet);
return SDValue();
}
Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@ -7101,6 +7112,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
@ -7226,6 +7241,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
case Intrinsic::amdgcn_global_atomic_fadd: {
if (!Op.getValue(0).use_empty()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
DL.getDebugLoc(), DS_Error);
DAG.getContext()->diagnose(NoFpRet);
return SDValue();
}
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
M->getOperand(3) // Value
};
EVT VT = Op.getOperand(3).getValueType();
return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
DAG.getVTList(VT, MVT::Other), Ops,
M->getMemOperand());
}
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
@ -7547,39 +7583,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Chain,
Op.getOperand(2), // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
// We don't know the offset if vindex is non-zero, so clear it.
if (IdxEn)
Offset = 0;
EVT VT = Op.getOperand(2).getValueType();
auto *M = cast<MemSDNode>(Op);
M->getMemOperand()->setOffset(Offset);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_FADD, DL,
Op->getVTList(), Ops, VT,
M->getMemOperand());
}
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);


@ -173,18 +173,6 @@ class SDBufferAtomic<string opcode> : SDNode <opcode,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
class SDBufferAtomicNoRtn<string opcode> : SDNode <opcode,
SDTypeProfile<0, 8,
[SDTCisVT<1, v4i32>, // rsrc
SDTCisVT<2, i32>, // vindex(VGPR)
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // cachepolicy(imm)
SDTCisVT<7, i1>]>, // idxen(imm)
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
@ -198,7 +186,7 @@ def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@ -316,18 +304,6 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
} // End let AddressSpaces = ...
} // End foreach AddrSpace
def atomic_fadd_global_noret_impl : PatFrag<
(ops node:$ptr, node:$value),
(atomic_load_fadd node:$ptr, node:$value)> {
// FIXME: Move this
let MemoryVT = f32;
let IsAtomic = 1;
let AddressSpaces = StoreAddress_global.AddrSpaces;
}
def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_global_atomic_fadd node:$src0, node:$src1),
(atomic_fadd_global_noret_impl node:$src0, node:$src1)]>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.


@ -2435,7 +2435,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);


@ -0,0 +1,10 @@
; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0
; GFX908: error: {{.*}} return versions of fp atomics not supported
define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) {
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
ret float %ret
}


@ -8,7 +8,7 @@ define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
ret void
}
@ -26,7 +26,7 @@ define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %da
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
ret void
}
@ -44,7 +44,7 @@ define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
ret void
}
@ -62,7 +62,7 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_endpgm
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
ret void
}
@ -73,7 +73,7 @@ define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half>
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
ret void
}
@ -91,11 +91,11 @@ define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr,
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
ret void
}
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0
declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
attributes #0 = { argmemonly nounwind willreturn }


@ -16,7 +16,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -35,7 +35,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
@ -52,7 +52,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
ret void
}
@ -70,7 +70,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@ -117,7 +117,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: bb.4:
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -162,7 +162,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: bb.4:
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@ -181,7 +181,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset = add i32 %voffset.base, 4095
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -200,7 +200,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@ -218,7 +218,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -235,11 +235,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
attributes #0 = { nounwind }


@ -0,0 +1,11 @@
; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
; GFX908: error: {{.*}} return versions of fp atomics not supported
define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
main_body:
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}


@ -18,7 +18,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
@ -57,7 +57,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
ret void
}
@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@ -126,7 +126,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: bb.4:
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -173,7 +173,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: bb.4:
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@ -194,7 +194,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@ -212,7 +212,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
ret void
}
@ -232,7 +232,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -250,11 +250,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
attributes #0 = { nounwind }


@ -58,14 +58,12 @@ body: |
; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -105,14 +103,12 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)


@ -1961,16 +1961,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2013,16 +2009,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2074,16 +2066,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2127,16 +2115,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2186,16 +2170,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2239,16 +2219,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2297,16 +2273,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2349,16 +2321,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2407,16 +2375,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2459,16 +2423,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2517,16 +2477,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2569,16 +2525,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2626,16 +2578,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: bb.2:
; CHECK: successors: %bb.3, %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2677,16 +2625,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; GREEDY: bb.2:
; GREEDY: successors: %bb.3, %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)


@ -15,27 +15,27 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4)
; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4)
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4)
@ -49,13 +49,13 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 112, align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4)
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4)
@ -64,7 +64,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4)
; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4)
@ -73,7 +73,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4)
; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4)
@ -82,7 +82,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4)
; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0
@ -101,7 +101,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4)
; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4)
@ -110,7 +110,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4)
; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4)
@ -119,7 +119,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4)
; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112
@ -135,7 +135,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4)
; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120
@ -150,7 +150,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4)
; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
@ -164,7 +164,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4)
@ -193,7 +193,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4)
; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
@ -207,7 +207,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
; GCN: INLINEASM &"", 1
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4)
; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152
@ -268,10 +268,10 @@ bb.0:
call void asm sideeffect "", "" ()
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
%fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
%fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
%fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
%fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
call void asm sideeffect "", "" ()
@ -392,7 +392,7 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2
declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2


@ -68,7 +68,6 @@ done:
declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind readnone willreturn }


@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
@ -9,14 +8,14 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
; OPT-NEXT: entry:
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
; OPT-NEXT: call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
@ -57,7 +56,7 @@ entry:
br i1 %cmp, label %endif, label %if
if:
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
%fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
%val = load volatile float, float addrspace(1)* undef
br label %endif
@ -71,7 +70,7 @@ done:
}
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #2
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind readnone willreturn }


@ -8,12 +8,12 @@
; have the instruction available.
; FIXME: Should also really make sure the v2f16 version fails.
; FAIL: LLVM ERROR: Cannot select: {{.+}}: ch = BUFFER_ATOMIC_FADD
; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) {
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
ret void
}
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
attributes #0 = { nounwind }


@ -1,12 +1,12 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,CAS %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,CAS %s
; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32:
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
; GCN: v_add_f32_e32
; GCN: global_atomic_cmpswap
; GCN: s_andn2_b64 exec, exec,
; GCN-NEXT: s_cbranch_execnz [[LOOP]]
; CAS: [[LOOP:BB[0-9]+_[0-9]+]]
; CAS: v_add_f32_e32
; CAS: global_atomic_cmpswap
; CAS: s_andn2_b64 exec, exec,
; CAS-NEXT: s_cbranch_execnz [[LOOP]]
define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
store float %result, float addrspace(1)* undef
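For reference, a minimal sketch (not part of the commit; the function name is hypothetical and the mangled intrinsic name is taken from the other tests in this change) of the two patterns exercised here, assuming gfx908. A call to the fadd intrinsic whose result is left unused only needs the no-return global_atomic_add_f32 encoding, whereas atomicrmw fadd must produce the old value and is therefore expanded into the compare-and-swap loop checked under the CAS prefix above.

declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)

define amdgpu_kernel void @fadd_noret_vs_ret_sketch(float addrspace(1)* %ptr) {
  ; Result deliberately ignored: the no-return instruction is sufficient on gfx908.
  %unused = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float 4.0)
  ; The old value is required here, so this lowers to the CAS loop on gfx900/gfx908.
  %old = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
  store float %old, float addrspace(1)* undef
  ret void
}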


@ -15,7 +15,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
ret void
}
@ -28,7 +28,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
ret void
}
@ -40,7 +40,7 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sba
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
ret void
}
@ -53,11 +53,11 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inr
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
ret void
}
declare void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
declare void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
attributes #0 = { argmemonly nounwind willreturn }


@ -1,15 +1,15 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
declare void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)*, float)
declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
; GCN-LABEL: {{^}}buffer_atomic_add_f32:
; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
ret void
}
@ -17,7 +17,7 @@ main_body:
; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc
define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
ret void
}
@ -25,7 +25,7 @@ main_body:
; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
%ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
ret void
}
@ -33,7 +33,7 @@ main_body:
; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc
define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
main_body:
call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
%ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
ret void
}
@ -41,7 +41,7 @@ main_body:
; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
main_body:
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
ret void
}
@ -50,7 +50,7 @@ main_body:
define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
main_body:
%p = getelementptr float, float addrspace(1)* %ptr, i64 1
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
ret void
}
@ -59,7 +59,7 @@ main_body:
define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
main_body:
%p = getelementptr float, float addrspace(1)* %ptr, i64 -1
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
ret void
}
@ -67,7 +67,7 @@ main_body:
; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off
define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
ret void
}
@ -76,7 +76,7 @@ main_body:
define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
ret void
}
@ -85,7 +85,7 @@ main_body:
define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
ret void
}
@ -94,7 +94,7 @@ main_body:
; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 {
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
ret void
}


@ -10,7 +10,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret void
}
@ -23,7 +23,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -49,7 +49,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
@ -62,11 +62,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
attributes #0 = { nounwind }
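As a usage note (a hypothetical sketch, not taken from the commit), the raw-buffer form as declared above takes the value, the resource, a voffset, an soffset, and an i32 cache-policy immediate; passing 2 sets the slc bit, matching the "offen slc" encoding checked in the last test, and the result can simply be dropped when only the no-return form is needed.

declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)

define amdgpu_ps void @raw_fadd_slc_sketch(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
  ; cachepolicy = 2 requests slc; leaving the result unused keeps this selectable
  ; as the no-return buffer_atomic_add_f32 on gfx908.
  %unused = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
  ret void
}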


@ -11,7 +11,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@ -25,7 +25,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@ -38,7 +38,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@ -51,11 +51,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
attributes #0 = { nounwind }


@ -29,12 +29,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64
%cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to float addrspace(1)*
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0)
call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %castback, float 100.0)
store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
ret void
}
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }