[AMDGPU] Unify intrinsic ret/noret interface

We have a single noret intrinsic and a lot of special handling around it. Declare it just like any other intrinsic and instead simply do not define the rtn instructions.

Differential Revision: https://reviews.llvm.org/D87719
This commit is contained in: parent 7b4cc0961b, commit 277de43d88
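To illustrate the unified interface, here is a minimal LLVM IR sketch distilled from the tests updated later in this diff (the mangled intrinsic name and the diagnostic text come from those tests; the function names are illustrative only). The intrinsic is now declared with a result, the no-return form is expressed by simply ignoring that result, and a use of the result is diagnosed instead of selected:

  declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)

  define void @fadd_noret(float addrspace(1)* %ptr, float %data) {
    ; Result unused: the no-return global_atomic_add_f32 form is selected.
    %unused = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
    ret void
  }

  define float @fadd_ret(float addrspace(1)* %ptr, float %data) {
    ; Using the result triggers "return versions of fp atomics not supported".
    %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
    ret float %ret
  }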
@@ -1012,7 +1012,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
@@ -1049,7 +1049,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty, /*NoRtn*/1>;
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;


// Obsolescent tbuffer intrinsics.
@@ -1181,6 +1181,19 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(SGPR/VGPR/imm)
   llvm_i1_ty],      // slc(imm)
  [ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
@@ -1800,27 +1813,7 @@ def int_amdgcn_udot8 :
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUBufferAtomicNoRtn : Intrinsic <
  [],
  [llvm_anyfloat_ty, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(SGPR/VGPR/imm)
   llvm_i1_ty],      // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

class AMDGPUGlobalAtomicNoRtn : Intrinsic <
  [],
  [llvm_anyptr_ty,    // vaddr
   llvm_anyfloat_ty], // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,
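The raw and struct buffer variants follow the same convention after these hunks: they are declared with a return value and the no-return use simply discards it. A small sketch, assuming the declaration used in the updated buffer tests further below (the function name is illustrative):

  declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)

  define amdgpu_ps void @buffer_fadd_noret(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
    ; The ignored result selects BUFFER_ATOMIC_ADD_F32_OFFEN without a return operand.
    %unused = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
    ret void
  }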
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -1743,6 +1744,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_s_barrier:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_global_atomic_fadd:
    return selectGlobalAtomicFaddIntrinsic(I);
  default: {
    return selectImpl(I, *CoverageInfo);
  }
@@ -2899,6 +2902,123 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
  return true;
}

bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
  MachineInstr &MI) const {

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
    Function &F = MBB->getParent()->getFunction();
    DiagnosticInfoUnsupported
      NoFpRet(F, "return versions of fp atomics not supported",
              MI.getDebugLoc(), DS_Error);
    F.getContext().diagnose(NoFpRet);
    return false;
  }

  // FIXME: This is only needed because tablegen requires number of dst operands
  // in match and replace pattern to be the same. Otherwise patterns can be
  // exported from SDag path.
  MachineOperand &VDataIn = MI.getOperand(1);
  MachineOperand &VIndex = MI.getOperand(3);
  MachineOperand &VOffset = MI.getOperand(4);
  MachineOperand &SOffset = MI.getOperand(5);
  int16_t Offset = MI.getOperand(6).getImm();

  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);

  unsigned Opcode;
  if (HasVOffset) {
    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
  } else {
    Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
                       : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
  }

  if (MRI->getType(VDataIn.getReg()).isVector()) {
    switch (Opcode) {
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
      break;
    case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
      Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
      break;
    }
  }

  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
  I.add(VDataIn);

  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
      Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
    Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
    BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex.getReg())
      .addImm(AMDGPU::sub0)
      .addReg(VOffset.getReg())
      .addImm(AMDGPU::sub1);

    I.addReg(IdxReg);
  } else if (HasVIndex) {
    I.add(VIndex);
  } else if (HasVOffset) {
    I.add(VOffset);
  }

  I.add(MI.getOperand(2)); // rsrc
  I.add(SOffset);
  I.addImm(Offset);
  renderExtractSLC(I, MI, 7);
  I.cloneMemRefs(MI);

  MI.eraseFromParent();

  return true;
}

bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
  MachineInstr &MI) const{

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
    Function &F = MBB->getParent()->getFunction();
    DiagnosticInfoUnsupported
      NoFpRet(F, "return versions of fp atomics not supported",
              MI.getDebugLoc(), DS_Error);
    F.getContext().diagnose(NoFpRet);
    return false;
  }

  // FIXME: This is only needed because tablegen requires number of dst operands
  // in match and replace pattern to be the same. Otherwise patterns can be
  // exported from SDag path.
  auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));

  Register Data = MI.getOperand(3).getReg();
  const unsigned Opc = MRI->getType(Data).isVector() ?
    AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
    .addReg(Addr.first)
    .addReg(Data)
    .addImm(Addr.second)
    .addImm(0) // SLC
    .cloneMemRefs(MI);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::select(MachineInstr &I) {
  if (I.isPHI())
    return selectPHI(I);
@@ -3018,6 +3138,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
    return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
@@ -3260,14 +3382,11 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
}

template <bool Signed>
InstructionSelector::ComplexRendererFns
std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();

  InstructionSelector::ComplexRendererFns Default = {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset
    }};
  auto Default = std::make_pair(Root.getReg(), 0);

  if (!STI.hasFlatInstOffsets())
    return Default;
@@ -3287,20 +3406,27 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {

  Register BasePtr = OpDef->getOperand(1).getReg();

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
    }};
  return std::make_pair(BasePtr, Offset.getValue());
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
  return selectFlatOffsetImpl<false>(Root);
  auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
  return selectFlatOffsetImpl<true>(Root);
  auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
    }};
}

/// Match a zero extend from a 32-bit value to 64-bits.
@@ -141,6 +141,8 @@ private:
  bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
  bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
  bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
  bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const;
  bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const;

  std::pair<Register, unsigned>
  selectVOP3ModsImpl(MachineOperand &Root) const;
@@ -180,11 +182,11 @@ private:
  selectSmrdSgpr(MachineOperand &Root) const;

  template <bool Signed>
  InstructionSelector::ComplexRendererFns
  std::pair<Register, int>
  selectFlatOffsetImpl(MachineOperand &Root) const;

  InstructionSelector::ComplexRendererFns
  selectFlatOffset(MachineOperand &Root) const;

  InstructionSelector::ComplexRendererFns
  selectFlatOffsetSigned(MachineOperand &Root) const;

@@ -483,6 +483,8 @@ defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
let MemoryVT = v2f16 in
defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;

def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
@@ -750,6 +750,9 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(

  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      if (MRI.use_nodbg_empty(Def.getReg()))
        continue;

      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
@@ -2971,7 +2974,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
@@ -3929,7 +3932,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

@@ -3952,23 +3956,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: {
    // vdata_in
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

@ -1094,14 +1094,12 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
|
|||
int_amdgcn_buffer_wbinvl1>;
|
||||
|
||||
let SubtargetPredicate = HasAtomicFaddInsts in {
|
||||
|
||||
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
|
||||
"buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
|
||||
"buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32
|
||||
>;
|
||||
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
|
||||
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
|
||||
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32
|
||||
>;
|
||||
|
||||
} // End SubtargetPredicate = HasAtomicFaddInsts
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1394,36 +1392,46 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
|
|||
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
|
||||
|
||||
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
|
||||
(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7),
|
||||
(vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)),
|
||||
[{ return SDValue(N, 0).use_empty(); }]> {
|
||||
|
||||
let GISelPredicateCode = [{
|
||||
return MRI.use_nodbg_empty(MI.getOperand(0).getReg());
|
||||
}];
|
||||
}
|
||||
|
||||
multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
|
||||
string opcode> {
|
||||
def : GCNPat<
|
||||
(name vt:$vdata_in, v4i32:$rsrc, 0,
|
||||
0, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, 0),
|
||||
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
|
||||
0, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, 0),
|
||||
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
>;
|
||||
|
||||
def : GCNPat<
|
||||
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
|
||||
0, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, timm),
|
||||
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
|
||||
0, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, timm),
|
||||
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
>;
|
||||
|
||||
def : GCNPat<
|
||||
(name vt:$vdata_in, v4i32:$rsrc, 0,
|
||||
i32:$voffset, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, 0),
|
||||
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
|
||||
i32:$voffset, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, 0),
|
||||
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
(as_i16timm $offset), (extract_slc $cachepolicy))
|
||||
>;
|
||||
|
||||
def : GCNPat<
|
||||
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
|
||||
i32:$voffset, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, timm),
|
||||
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
|
||||
i32:$voffset, i32:$soffset, timm:$offset,
|
||||
timm:$cachepolicy, timm),
|
||||
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
|
||||
getVregSrcForVT<vt>.ret:$vdata_in,
|
||||
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
|
||||
|
|
|
@@ -78,6 +78,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
  // copy relevant pseudo op flags
  let SubtargetPredicate = ps.SubtargetPredicate;
  let AsmMatchConverter = ps.AsmMatchConverter;
  let OtherPredicates = ps.OtherPredicates;
  let TSFlags = ps.TSFlags;
  let UseNamedOperandTable = ps.UseNamedOperandTable;
@@ -714,16 +715,16 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
  FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1

let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {

defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
  "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
  "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
>;

} // End SubtargetPredicate = HasAtomicFaddInsts
let is_flat_global = 1 in {
let OtherPredicates = [HasAtomicFaddInsts] in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
  "global_atomic_add_f32", VGPR_32, f32
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
  "global_atomic_pk_add_f16", VGPR_32, v2f16
>;
} // End OtherPredicates = [HasAtomicFaddInsts]
} // End is_flat_global = 1

//===----------------------------------------------------------------------===//
// Flat Patterns
@@ -1081,8 +1082,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>;

defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_fadd_global_noret, v2f16>;
let OtherPredicates = [HasAtomicFaddInsts] in {
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>;
defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>;
}

} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10

@ -1121,7 +1121,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|||
case Intrinsic::amdgcn_buffer_atomic_fadd: {
|
||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
Info.opc = ISD::INTRINSIC_VOID;
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
|
||||
Info.ptrVal = MFI->getBufferPSV(
|
||||
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
|
||||
|
@ -1135,18 +1135,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|||
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_global_atomic_fadd: {
|
||||
Info.opc = ISD::INTRINSIC_VOID;
|
||||
Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
|
||||
->getPointerElementType());
|
||||
Info.ptrVal = CI.getOperand(0);
|
||||
Info.align.reset();
|
||||
|
||||
// FIXME: Should report an atomic ordering here.
|
||||
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
|
||||
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_ds_append:
|
||||
case Intrinsic::amdgcn_ds_consume: {
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
|
@ -1171,6 +1159,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
|||
MachineMemOperand::MOVolatile;
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_global_atomic_fadd: {
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::getVT(CI.getType());
|
||||
Info.ptrVal = CI.getOperand(0);
|
||||
Info.align.reset();
|
||||
Info.flags = MachineMemOperand::MOLoad |
|
||||
MachineMemOperand::MOStore |
|
||||
MachineMemOperand::MODereferenceable |
|
||||
MachineMemOperand::MOVolatile;
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_ds_gws_init:
|
||||
case Intrinsic::amdgcn_ds_gws_barrier:
|
||||
case Intrinsic::amdgcn_ds_gws_sema_v:
|
||||
|
@ -7034,7 +7033,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||
case Intrinsic::amdgcn_buffer_atomic_umax:
|
||||
case Intrinsic::amdgcn_buffer_atomic_and:
|
||||
case Intrinsic::amdgcn_buffer_atomic_or:
|
||||
case Intrinsic::amdgcn_buffer_atomic_xor: {
|
||||
case Intrinsic::amdgcn_buffer_atomic_xor:
|
||||
case Intrinsic::amdgcn_buffer_atomic_fadd: {
|
||||
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
|
||||
unsigned IdxEn = 1;
|
||||
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
|
||||
|
@ -7094,6 +7094,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||
case Intrinsic::amdgcn_buffer_atomic_xor:
|
||||
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
|
||||
break;
|
||||
case Intrinsic::amdgcn_buffer_atomic_fadd:
|
||||
if (!Op.getValue(0).use_empty()) {
|
||||
DiagnosticInfoUnsupported
|
||||
NoFpRet(DAG.getMachineFunction().getFunction(),
|
||||
"return versions of fp atomics not supported",
|
||||
DL.getDebugLoc(), DS_Error);
|
||||
DAG.getContext()->diagnose(NoFpRet);
|
||||
return SDValue();
|
||||
}
|
||||
Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("unhandled atomic opcode");
|
||||
}
|
||||
|
@ -7101,6 +7112,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
|
||||
M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
|
||||
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
|
||||
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
|
||||
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
|
||||
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
|
||||
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
|
||||
case Intrinsic::amdgcn_raw_buffer_atomic_add:
|
||||
|
@ -7226,6 +7241,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
|||
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
|
||||
Op->getVTList(), Ops, VT, M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_global_atomic_fadd: {
|
||||
if (!Op.getValue(0).use_empty()) {
|
||||
DiagnosticInfoUnsupported
|
||||
NoFpRet(DAG.getMachineFunction().getFunction(),
|
||||
"return versions of fp atomics not supported",
|
||||
DL.getDebugLoc(), DS_Error);
|
||||
DAG.getContext()->diagnose(NoFpRet);
|
||||
return SDValue();
|
||||
}
|
||||
MemSDNode *M = cast<MemSDNode>(Op);
|
||||
SDValue Ops[] = {
|
||||
M->getOperand(0), // Chain
|
||||
M->getOperand(2), // Ptr
|
||||
M->getOperand(3) // Value
|
||||
};
|
||||
|
||||
EVT VT = Op.getOperand(3).getValueType();
|
||||
return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
|
||||
DAG.getVTList(VT, MVT::Other), Ops,
|
||||
M->getMemOperand());
|
||||
}
|
||||
default:
|
||||
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
|
||||
AMDGPU::getImageDimIntrinsicInfo(IntrID))
|
||||
|
@ -7547,39 +7583,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
|
|||
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
|
||||
M->getMemoryVT(), M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
|
||||
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
|
||||
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
|
||||
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
|
||||
case Intrinsic::amdgcn_buffer_atomic_fadd: {
|
||||
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
|
||||
unsigned IdxEn = 1;
|
||||
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
|
||||
IdxEn = Idx->getZExtValue() != 0;
|
||||
SDValue Ops[] = {
|
||||
Chain,
|
||||
Op.getOperand(2), // vdata
|
||||
Op.getOperand(3), // rsrc
|
||||
Op.getOperand(4), // vindex
|
||||
SDValue(), // voffset -- will be set by setBufferOffsets
|
||||
SDValue(), // soffset -- will be set by setBufferOffsets
|
||||
SDValue(), // offset -- will be set by setBufferOffsets
|
||||
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
|
||||
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
|
||||
};
|
||||
unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
|
||||
// We don't know the offset if vindex is non-zero, so clear it.
|
||||
if (IdxEn)
|
||||
Offset = 0;
|
||||
EVT VT = Op.getOperand(2).getValueType();
|
||||
|
||||
auto *M = cast<MemSDNode>(Op);
|
||||
M->getMemOperand()->setOffset(Offset);
|
||||
|
||||
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_FADD, DL,
|
||||
Op->getVTList(), Ops, VT,
|
||||
M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_end_cf:
|
||||
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
|
||||
Op->getOperand(2), Chain), 0);
|
||||
|
|
|
@@ -173,18 +173,6 @@ class SDBufferAtomic<string opcode> : SDNode <opcode,
  [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;

class SDBufferAtomicNoRtn<string opcode> : SDNode <opcode,
  SDTypeProfile<0, 8,
    [SDTCisVT<1, v4i32>, // rsrc
     SDTCisVT<2, i32>,   // vindex(VGPR)
     SDTCisVT<3, i32>,   // voffset(VGPR)
     SDTCisVT<4, i32>,   // soffset(SGPR)
     SDTCisVT<5, i32>,   // offset(imm)
     SDTCisVT<6, i32>,   // cachepolicy(imm)
     SDTCisVT<7, i1>]>,  // idxen(imm)
  [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;

def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
@@ -198,7 +186,7 @@ def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;

def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
  SDTypeProfile<1, 9,
@@ -316,18 +304,6 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
} // End let AddressSpaces = ...
} // End foreach AddrSpace

def atomic_fadd_global_noret_impl : PatFrag<
  (ops node:$ptr, node:$value),
  (atomic_load_fadd node:$ptr, node:$value)> {
  // FIXME: Move this
  let MemoryVT = f32;
  let IsAtomic = 1;
  let AddressSpaces = StoreAddress_global.AddrSpaces;
}

def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1),
  [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1),
   (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>;

//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.

@@ -2435,7 +2435,7 @@ def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);

@ -0,0 +1,10 @@
|
|||
; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
|
||||
|
||||
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0
|
||||
|
||||
; GFX908: error: {{.*}} return versions of fp atomics not supported
|
||||
|
||||
define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) {
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
ret float %ret
|
||||
}
|
|
@ -8,7 +8,7 @@ define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
|
|||
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,7 @@ define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %da
|
|||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -44,7 +44,7 @@ define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float
|
|||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -62,7 +62,7 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
|
|||
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
|
||||
; GFX908-NEXT: s_endpgm
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -73,7 +73,7 @@ define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half>
|
|||
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -91,11 +91,11 @@ define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr,
|
|||
; GFX908-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
%gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
|
||||
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0
|
||||
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
|
||||
|
||||
attributes #0 = { argmemonly nounwind willreturn }
|
||||
|
|
|
@ -16,7 +16,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -35,7 +35,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
|
|||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4095
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -70,7 +70,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -117,7 +117,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
|
|||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -162,7 +162,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
|
|||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -181,7 +181,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
|
|||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset = add i32 %voffset.base, 4095
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -200,7 +200,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
|
||||
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -218,7 +218,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -235,11 +235,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
|
||||
|
||||
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
|
||||
; GFX908: error: {{.*}} return versions of fp atomics not supported
|
||||
|
||||
define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
|
||||
main_body:
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret float %ret
|
||||
}
|
|
@ -18,7 +18,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4095
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -126,7 +126,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
|||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -173,7 +173,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
|||
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
|
||||
; CHECK: bb.4:
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -194,7 +194,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -212,7 +212,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
|
||||
%ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -232,7 +232,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
|
|||
; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
@ -250,11 +250,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
|
|||
; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
|
||||
; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; CHECK: S_ENDPGM 0
|
||||
call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
%ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
|
|
@ -58,14 +58,12 @@ body: |
|
|||
; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
|
||||
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: .1:
|
||||
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
|
||||
|
@ -105,14 +103,12 @@ body: |
|
|||
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
|
||||
; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: .1:
|
||||
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
|
||||
|
|
|
@ -1961,16 +1961,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
|
|||
; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
|
||||
|
@ -2013,16 +2009,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
|
|||
; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
|
||||
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
|
||||
|
@ -2074,16 +2066,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
|
|||
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
|
||||
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2127,16 +2115,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
|
|||
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
|
||||
; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2186,16 +2170,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
|
|||
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
|
||||
; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2239,16 +2219,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
|
|||
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
|
||||
; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2297,16 +2273,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
|
|||
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2349,16 +2321,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
|
|||
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2407,16 +2375,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
|
|||
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2459,16 +2423,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
|
|||
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2517,16 +2477,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
|
|||
; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2569,16 +2525,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
|
|||
; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
|
||||
; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
|
||||
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2626,16 +2578,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
|
|||
; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; CHECK: bb.2:
|
||||
; CHECK: successors: %bb.3, %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -2677,16 +2625,12 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
|
|||
; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
|
||||
; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
|
||||
; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
|
||||
; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
|
||||
; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
|
||||
; GREEDY: bb.2:
|
||||
; GREEDY: successors: %bb.3, %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2
|
||||
; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2
|
||||
; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec
|
||||
; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
|
||||
; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
@ -15,27 +15,27 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
|
||||
; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4)
@ -49,13 +49,13 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 112, align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4)
@ -64,7 +64,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4)
@ -73,7 +73,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80
|
||||
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4)
@ -82,7 +82,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
|
||||
; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4)
|
||||
; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0
@ -101,7 +101,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96
|
||||
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4)
@ -110,7 +110,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
|
||||
; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4)
@ -119,7 +119,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112
@ -135,7 +135,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120
@ -150,7 +150,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4)
|
||||
; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
@ -164,7 +164,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4)
@ -193,7 +193,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
|
||||
; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4)
|
||||
; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
@ -207,7 +207,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
|
|||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4)
|
||||
; GCN: INLINEASM &"", 1
|
||||
; GCN: INLINEASM &"", 1 /* sideeffect attdialect */
|
||||
; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4)
|
||||
; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152
@ -268,10 +268,10 @@ bb.0:
call void asm sideeffect "", "" ()
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
|
||||
%fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
|
||||
%fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
|
||||
%fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
|
||||
%fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
call void asm sideeffect "", "" ()
@ -392,7 +392,7 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i
|
|||
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
|
||||
declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2
|
||||
declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2
|
||||
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
|
||||
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
|
||||
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
|
||||
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
|
||||
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2
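A minimal sketch of how the unified legacy interface is exercised, for readers of this patch. The function name @fadd_example is illustrative and not part of the patch; the intrinsic signature is taken from the declarations shown in these tests.
; Sketch only: the legacy buffer fadd intrinsic is now declared with a result,
; and a caller that does not need the old value simply ignores it.
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg)

define amdgpu_cs void @fadd_example(<4 x i32> inreg %rsrc, float %val) {
  ; Result is dropped here, matching the former no-return usage pattern.
  %old = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 112, i1 false)
  ret void
}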
@ -68,7 +68,6 @@ done:
declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
|
||||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2
attributes #0 = { argmemonly nounwind }
|
||||
attributes #1 = { nounwind readnone willreturn }
@ -1,5 +1,4 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
@ -9,14 +8,14 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
|
|||
; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
|
||||
; OPT-NEXT: entry:
|
||||
; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
|
||||
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
|
||||
; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
|
||||
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
|
||||
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
|
||||
; OPT: if:
|
||||
; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
|
||||
; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
|
||||
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
|
||||
; OPT-NEXT: call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
|
||||
; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
|
||||
; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
|
||||
; OPT-NEXT: br label [[ENDIF]]
|
||||
; OPT: endif:
@ -57,7 +56,7 @@ entry:
|
|||
br i1 %cmp, label %endif, label %if
if:
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
|
||||
%fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
|
||||
%val = load volatile float, float addrspace(1)* undef
|
||||
br label %endif
@ -71,7 +70,7 @@ done:
|
|||
}
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2
|
||||
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #2
attributes #0 = { argmemonly nounwind }
|
||||
attributes #1 = { nounwind readnone willreturn }
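As a hedged illustration of the same change for the global form: the function name @global_fadd_example is invented, and the mangled intrinsic name follows the declaration used in this particular test (other tests in the patch use a slightly different suffix).
; Sketch only: the global atomic fadd is declared with a float result and the
; result is ignored when only the no-return behavior is wanted.
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)

define amdgpu_kernel void @global_fadd_example(float addrspace(1)* %ptr, float %data) {
  %old = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
  ret void
}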
@ -8,12 +8,12 @@
|
|||
; have the instruction available.
|
||||
; FIXME: Should also really make sure the v2f16 version fails.
; FAIL: LLVM ERROR: Cannot select: {{.+}}: ch = BUFFER_ATOMIC_FADD
|
||||
; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
|
||||
define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) {
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
|
||||
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
|
||||
ret void
|
||||
}
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
|
||||
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
attributes #0 = { nounwind }
@ -1,12 +1,12 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,CAS %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,CAS %s
; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32:
|
||||
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
|
||||
; GCN: v_add_f32_e32
|
||||
; GCN: global_atomic_cmpswap
|
||||
; GCN: s_andn2_b64 exec, exec,
|
||||
; GCN-NEXT: s_cbranch_execnz [[LOOP]]
|
||||
; CAS: [[LOOP:BB[0-9]+_[0-9]+]]
|
||||
; CAS: v_add_f32_e32
|
||||
; CAS: global_atomic_cmpswap
|
||||
; CAS: s_andn2_b64 exec, exec,
|
||||
; CAS-NEXT: s_cbranch_execnz [[LOOP]]
|
||||
define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) {
|
||||
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
|
||||
store float %result, float addrspace(1)* undef
@ -15,7 +15,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase
|
|||
%zext.offset = zext i32 %voffset to i64
|
||||
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||||
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
|
||||
call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
|
||||
ret void
|
||||
}
@ -28,7 +28,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg
|
|||
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||||
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||||
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
|
||||
call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
|
||||
ret void
|
||||
}
@ -40,7 +40,7 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sba
|
|||
%zext.offset = zext i32 %voffset to i64
|
||||
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||||
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
|
||||
call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
|
||||
ret void
|
||||
}
@ -53,11 +53,11 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inr
|
|||
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||||
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||||
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
|
||||
call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
|
||||
ret void
|
||||
}
declare void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
|
||||
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
|
||||
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
attributes #0 = { argmemonly nounwind willreturn }
@ -1,15 +1,15 @@
|
|||
; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
|
||||
declare void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)*, float)
|
||||
declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
|
||||
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
|
||||
declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
|
||||
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
|
||||
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
; GCN-LABEL: {{^}}buffer_atomic_add_f32:
|
||||
; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen
|
||||
define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
|
||||
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
|
||||
ret void
|
||||
}
@ -17,7 +17,7 @@ main_body:
|
|||
; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc
|
||||
define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
|
||||
%ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
|
||||
ret void
|
||||
}
@ -25,7 +25,7 @@ main_body:
|
|||
; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen
|
||||
define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
|
||||
%ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
|
||||
ret void
|
||||
}
@ -33,7 +33,7 @@ main_body:
|
|||
; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc
|
||||
define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
|
||||
%ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
|
||||
ret void
|
||||
}
@ -41,7 +41,7 @@ main_body:
|
|||
; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
|
||||
define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
ret void
|
||||
}
@ -50,7 +50,7 @@ main_body:
|
|||
define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
|
||||
main_body:
|
||||
%p = getelementptr float, float addrspace(1)* %ptr, i64 1
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
|
||||
ret void
|
||||
}
@ -59,7 +59,7 @@ main_body:
|
|||
define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
|
||||
main_body:
|
||||
%p = getelementptr float, float addrspace(1)* %ptr, i64 -1
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data)
|
||||
ret void
|
||||
}
@ -67,7 +67,7 @@ main_body:
|
|||
; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off
|
||||
define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
|
||||
main_body:
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
|
||||
ret void
|
||||
}
@ -76,7 +76,7 @@ main_body:
|
|||
define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
|
||||
main_body:
|
||||
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
|
||||
ret void
|
||||
}
@ -85,7 +85,7 @@ main_body:
|
|||
define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
|
||||
main_body:
|
||||
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
|
||||
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data)
|
||||
ret void
|
||||
}
@ -94,7 +94,7 @@ main_body:
|
|||
; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
|
||||
; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
|
||||
define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 {
|
||||
call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
|
||||
ret void
|
||||
}
@@ -10,7 +10,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24)
  ret void
}

@@ -23,7 +23,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
  ret void
}

@@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

@@ -49,7 +49,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
  %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
  ret void
}

@@ -62,11 +62,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0

attributes #0 = { nounwind }

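For the raw buffer form, only the declared result type changes; the operand list (vdata, rsrc, voffset, soffset, cache-policy immediate) is the same as before. A minimal sketch of a caller migrated to the unified declaration, with the result left unused as in the tests above (the function name @raw_fadd_sketch is illustrative):

declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)

define amdgpu_ps void @raw_fadd_sketch(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
  ; operands: vdata, rsrc, voffset, soffset, cache-policy immediate (the slc test above passes 2)
  %old = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}
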
@@ -11,7 +11,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

@@ -25,7 +25,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
  ret void
}

@@ -38,7 +38,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen slc
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
  %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
  ret void
}

@@ -51,11 +51,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
; CHECK-NEXT: s_mov_b32 s8, s2
; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen
; CHECK-NEXT: s_endpgm
  call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}

declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0

attributes #0 = { nounwind }

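The struct buffer form carries one extra operand, the vindex, ahead of the offsets; otherwise the migration is the same as for the raw form. A minimal sketch under the same assumptions (the function name @struct_fadd_sketch is illustrative):

declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg)

define amdgpu_ps void @struct_fadd_sketch(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) {
  ; operands: vdata, rsrc, vindex, voffset, soffset, cache-policy immediate
  %old = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret void
}
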
@@ -29,12 +29,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64
  %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
  %shl = shl i64 %cast, 2
  %castback = inttoptr i64 %shl to float addrspace(1)*
  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0)
  call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %castback, float 100.0)
  store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
  ret void
}

declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }