[AMDGPU][GlobalISel] Fix 96-bit and 128-bit local loads and stores

Fix local ds_read/write_b96/b128 so they can be selected if the alignment allows. Otherwise, either pick the appropriate ds_read2/write2 instructions or break them down.

Differential Revision: https://reviews.llvm.org/D81638
commit d17ea67b92
parent f5cd7ec9f3
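As a rough sketch of the intended effect (illustrative only, not part of the commit: the function names below are made up, and the behavior mirrors the new tests added further down; the exact instruction chosen still depends on the subtarget and on whether the alignment requirement for ds_read_b128 is met):

; With 16-byte alignment the whole 128-bit LDS load can be selected as a
; single ds_read_b128.
define <4 x i32> @lds_load_align16(<4 x i32> addrspace(3)* %ptr) {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
  ret <4 x i32> %v
}

; With only 8-byte alignment, targets that require 16-byte alignment for
; ds_read_b128 (e.g. GFX7) now fall back to ds_read2_b64 instead of
; splitting the load further.
define <4 x i32> @lds_load_align8(<4 x i32> addrspace(3)* %ptr) {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ret <4 x i32> %v
}

The same reasoning applies to the 96-bit ds_read_b96 loads and to the corresponding ds_write_b96/b128 and ds_write2 stores.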
@@ -1072,6 +1072,11 @@ def isGFX7GFX10 :
|
|||
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
|
||||
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
|
||||
|
||||
def isGFX7GFX8 :
|
||||
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
|
||||
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS">,
|
||||
AssemblerPredicate<(all_of FeatureSouthernIslands, FeatureCIInsts)>;
|
||||
|
||||
def isGFX7GFX8GFX9 :
|
||||
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
|
||||
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
|
||||
|
|
|
@@ -93,6 +93,10 @@ def gi_ds_64bit_4byte_aligned :
|
|||
GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
|
||||
GIComplexPatternEquiv<DS64Bit4ByteAligned>;
|
||||
|
||||
def gi_ds_128bit_8byte_aligned :
|
||||
GIComplexOperandMatcher<s64, "selectDS128Bit8ByteAligned">,
|
||||
GIComplexPatternEquiv<DS128Bit8ByteAligned>;
|
||||
|
||||
def gi_mubuf_addr64 :
|
||||
GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
|
||||
GIComplexPatternEquiv<MUBUFAddr64>;
|
||||
|
|
|
@@ -205,6 +205,10 @@ private:
|
|||
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
|
||||
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
|
||||
SDValue &Offset1) const;
|
||||
bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
|
||||
SDValue &Offset1) const;
|
||||
bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
|
||||
SDValue &Offset1, bool IsDS128) const;
|
||||
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
|
||||
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
|
||||
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
|
||||
|
@@ -1231,38 +1235,52 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
|
|||
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
|
||||
SDValue &Offset0,
|
||||
SDValue &Offset1) const {
|
||||
return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, false);
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
|
||||
SDValue &Offset0,
|
||||
SDValue &Offset1) const {
|
||||
return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, true);
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
|
||||
SDValue &Offset0, SDValue &Offset1,
|
||||
bool IsDS128) const {
|
||||
SDLoc DL(Addr);
|
||||
unsigned Align = IsDS128 ? 8 : 4;
|
||||
|
||||
if (CurDAG->isBaseWithConstantOffset(Addr)) {
|
||||
SDValue N0 = Addr.getOperand(0);
|
||||
SDValue N1 = Addr.getOperand(1);
|
||||
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
|
||||
unsigned DWordOffset0 = C1->getZExtValue() / 4;
|
||||
unsigned DWordOffset1 = DWordOffset0 + 1;
|
||||
unsigned OffsetValue0 = C1->getZExtValue() / Align;
|
||||
unsigned OffsetValue1 = OffsetValue0 + 1;
|
||||
// (add n0, c0)
|
||||
if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
|
||||
if (isDSOffsetLegal(N0, OffsetValue1, 8)) {
|
||||
Base = N0;
|
||||
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
|
||||
Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
|
||||
return true;
|
||||
}
|
||||
} else if (Addr.getOpcode() == ISD::SUB) {
|
||||
// sub C, x -> add (sub 0, x), C
|
||||
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
|
||||
unsigned DWordOffset0 = C->getZExtValue() / 4;
|
||||
unsigned DWordOffset1 = DWordOffset0 + 1;
|
||||
if (const ConstantSDNode *C =
|
||||
dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
|
||||
unsigned OffsetValue0 = C->getZExtValue() / Align;
|
||||
unsigned OffsetValue1 = OffsetValue0 + 1;
|
||||
|
||||
if (isUInt<8>(DWordOffset0)) {
|
||||
if (isUInt<8>(OffsetValue0)) {
|
||||
SDLoc DL(Addr);
|
||||
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
|
||||
|
||||
// XXX - This is kind of hacky. Create a dummy sub node so we can check
|
||||
// the known bits in isDSOffsetLegal. We need to emit the selected node
|
||||
// here, so this is thrown away.
|
||||
SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
|
||||
Zero, Addr.getOperand(1));
|
||||
SDValue Sub =
|
||||
CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
|
||||
|
||||
if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
|
||||
if (isDSOffsetLegal(Sub, OffsetValue1, 8)) {
|
||||
SmallVector<SDValue, 3> Opnds;
|
||||
Opnds.push_back(Zero);
|
||||
Opnds.push_back(Addr.getOperand(1));
|
||||
|
@@ -1273,29 +1291,28 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
|
|||
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
|
||||
}
|
||||
|
||||
MachineSDNode *MachineSub
|
||||
= CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
|
||||
MachineSDNode *MachineSub = CurDAG->getMachineNode(
|
||||
SubOp, DL, (IsDS128 ? MVT::i64 : MVT::i32), Opnds);
|
||||
|
||||
Base = SDValue(MachineSub, 0);
|
||||
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
|
||||
Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
|
||||
unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
|
||||
unsigned DWordOffset1 = DWordOffset0 + 1;
|
||||
assert(4 * DWordOffset0 == CAddr->getZExtValue());
|
||||
unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
|
||||
unsigned OffsetValue1 = OffsetValue0 + 1;
|
||||
assert(Align * OffsetValue0 == CAddr->getZExtValue());
|
||||
|
||||
if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
|
||||
if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) {
|
||||
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
|
||||
MachineSDNode *MovZero
|
||||
= CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
|
||||
DL, MVT::i32, Zero);
|
||||
MachineSDNode *MovZero =
|
||||
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
|
||||
Base = SDValue(MovZero, 0);
|
||||
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
|
||||
Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
|
||||
Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -3545,9 +3545,20 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
|
|||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
|
||||
return selectDSReadWrite2(Root, false);
|
||||
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
|
||||
return selectDSReadWrite2(Root, true);
|
||||
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
|
||||
bool IsDS128) const {
|
||||
Register Reg;
|
||||
unsigned Offset;
|
||||
std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
|
||||
std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
|
||||
return {{
|
||||
[=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
|
||||
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
|
||||
|
@@ -3556,7 +3567,8 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const
|
|||
}
|
||||
|
||||
std::pair<Register, unsigned>
|
||||
AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
|
||||
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
|
||||
bool IsDS128) const {
|
||||
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
|
||||
if (!RootDef)
|
||||
return std::make_pair(Root.getReg(), 0);
|
||||
|
@@ -3569,11 +3581,11 @@ AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) c
|
|||
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
|
||||
|
||||
if (Offset) {
|
||||
int64_t DWordOffset0 = Offset / 4;
|
||||
int64_t DWordOffset1 = DWordOffset0 + 1;
|
||||
if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
|
||||
int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
|
||||
int64_t OffsetValue1 = OffsetValue0 + 1;
|
||||
if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
|
||||
// (add n0, c0)
|
||||
return std::make_pair(PtrBase, DWordOffset0);
|
||||
return std::make_pair(PtrBase, OffsetValue0);
|
||||
}
|
||||
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
|
||||
// TODO
|
||||
|
|
|
@@ -202,11 +202,17 @@ private:
|
|||
InstructionSelector::ComplexRendererFns
|
||||
selectDS1Addr1Offset(MachineOperand &Root) const;
|
||||
|
||||
std::pair<Register, unsigned>
|
||||
selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectDS64Bit4ByteAligned(MachineOperand &Root) const;
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectDS128Bit8ByteAligned(MachineOperand &Root) const;
|
||||
|
||||
std::pair<Register, unsigned>
|
||||
selectDSReadWrite2Impl(MachineOperand &Root, bool IsDS128) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectDSReadWrite2(MachineOperand &Root, bool IsDS128) const;
|
||||
|
||||
std::pair<Register, int64_t>
|
||||
getPtrBaseWithConstantOffset(Register Root,
|
||||
const MachineRegisterInfo &MRI) const;
|
||||
|
|
|
@@ -485,17 +485,16 @@ defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
|
|||
defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
|
||||
defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;
|
||||
|
||||
|
||||
def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
|
||||
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
|
||||
Aligned<8> {
|
||||
let IsLoad = 1;
|
||||
let IsNonExtLoad = 1;
|
||||
let MinAlignment = 8;
|
||||
}
|
||||
|
||||
def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
|
||||
def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
|
||||
Aligned<16> {
|
||||
let IsLoad = 1;
|
||||
let IsNonExtLoad = 1;
|
||||
let MinAlignment = 16;
|
||||
}
|
||||
|
||||
def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
|
||||
|
|
|
@@ -680,7 +680,29 @@ foreach vt = VReg_64.RegTypes in {
|
|||
defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
|
||||
}
|
||||
|
||||
defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
|
||||
let SubtargetPredicate = isGFX7GFX8 in {
|
||||
|
||||
foreach vt = VReg_96.RegTypes in {
|
||||
defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
|
||||
}
|
||||
|
||||
foreach vt = VReg_128.RegTypes in {
|
||||
defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGFX9Plus in {
|
||||
|
||||
foreach vt = VReg_96.RegTypes in {
|
||||
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
|
||||
}
|
||||
|
||||
foreach vt = VReg_128.RegTypes in {
|
||||
defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // End AddedComplexity = 100
|
||||
|
||||
|
@@ -761,6 +783,18 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> :
|
|||
(i1 0))
|
||||
>;
|
||||
|
||||
class DS128Bit8ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
|
||||
(vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
|
||||
(inst $ptr, $offset0, $offset1, (i1 0))
|
||||
>;
|
||||
|
||||
class DS128Bit8ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
|
||||
(frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
|
||||
(inst $ptr, (i64 (EXTRACT_SUBREG VReg_128:$value, sub0_sub1)),
|
||||
(i64 (EXTRACT_SUBREG VReg_128:$value, sub2_sub3)), $offset0, $offset1,
|
||||
(i1 0))
|
||||
>;
|
||||
|
||||
multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
|
||||
let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
|
||||
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
|
||||
|
@@ -773,19 +807,57 @@ multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
|
|||
}
|
||||
}
|
||||
|
||||
multiclass DS128Bit8ByteAlignedPat_mc<ValueType vt> {
|
||||
let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
|
||||
def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64, vt, load_local_m0>;
|
||||
def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64, vt, store_local_m0>;
|
||||
}
|
||||
|
||||
let OtherPredicates = [NotLDSRequiresM0Init] in {
|
||||
def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64_gfx9, vt, load_local>;
|
||||
def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64_gfx9, vt, store_local>;
|
||||
}
|
||||
}
|
||||
|
||||
// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
|
||||
// related to bounds checking.
|
||||
foreach vt = VReg_64.RegTypes in {
|
||||
defm : DS64Bit4ByteAlignedPat_mc<vt>;
|
||||
}
|
||||
|
||||
foreach vt = VReg_128.RegTypes in {
|
||||
defm : DS128Bit8ByteAlignedPat_mc<vt>;
|
||||
}
|
||||
|
||||
let AddedComplexity = 100 in {
|
||||
|
||||
foreach vt = VReg_64.RegTypes in {
|
||||
defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
|
||||
}
|
||||
|
||||
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
|
||||
let SubtargetPredicate = isGFX7GFX8 in {
|
||||
|
||||
foreach vt = VReg_96.RegTypes in {
|
||||
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
|
||||
}
|
||||
|
||||
foreach vt = VReg_128.RegTypes in {
|
||||
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
let SubtargetPredicate = isGFX9Plus in {
|
||||
|
||||
foreach vt = VReg_96.RegTypes in {
|
||||
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
|
||||
}
|
||||
|
||||
foreach vt = VReg_128.RegTypes in {
|
||||
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // End AddedComplexity = 100
|
||||
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
|
||||
|
|
|
@@ -433,16 +433,15 @@ def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)
|
|||
}
|
||||
|
||||
def load_align8_local_m0 : PatFrag<(ops node:$ptr),
|
||||
(load_local_m0 node:$ptr)> {
|
||||
(load_local_m0 node:$ptr)>, Aligned<8> {
|
||||
let IsLoad = 1;
|
||||
let IsNonExtLoad = 1;
|
||||
let MinAlignment = 8;
|
||||
}
|
||||
|
||||
def load_align16_local_m0 : PatFrag<(ops node:$ptr),
|
||||
(load_local_m0 node:$ptr)> {
|
||||
(load_local_m0 node:$ptr)>, Aligned<16> {
|
||||
let IsLoad = 1;
|
||||
let IsNonExtLoad = 1;
|
||||
let MinAlignment = 16;
|
||||
}
|
||||
|
||||
} // End IsLoad = 1
|
||||
|
@@ -518,20 +517,18 @@ def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
|
|||
}
|
||||
}
|
||||
|
||||
def store_align16_local_m0 : PatFrag <
|
||||
(ops node:$value, node:$ptr),
|
||||
(store_local_m0 node:$value, node:$ptr)> {
|
||||
def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr),
|
||||
(store_local_m0 node:$value, node:$ptr)>,
|
||||
Aligned<8> {
|
||||
let IsStore = 1;
|
||||
let IsTruncStore = 0;
|
||||
let MinAlignment = 16;
|
||||
}
|
||||
|
||||
def store_align8_local_m0 : PatFrag <
|
||||
(ops node:$value, node:$ptr),
|
||||
(store_local_m0 node:$value, node:$ptr)> {
|
||||
def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
|
||||
(store_local_m0 node:$value, node:$ptr)>,
|
||||
Aligned<16> {
|
||||
let IsStore = 1;
|
||||
let IsTruncStore = 0;
|
||||
let MinAlignment = 8;
|
||||
}
|
||||
|
||||
let AddressSpaces = StoreAddress_local.AddrSpaces in {
|
||||
|
@@ -1296,6 +1293,7 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
|
|||
|
||||
def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
|
||||
def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
|
||||
def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">;
|
||||
|
||||
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
|
||||
|
||||
|
|
|
@@ -42,15 +42,15 @@ body: |
|
|||
|
||||
; GFX7-LABEL: name: load_local_v4s32_align_8
|
||||
; GFX7: liveins: $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX7: $m0 = S_MOV_B32 -1
|
||||
; GFX7: [[LOAD:%[0-9]+]]:vreg_128(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
|
||||
; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
|
||||
; GFX9-LABEL: name: load_local_v4s32_align_8
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX9: [[LOAD:%[0-9]+]]:vreg_128(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
|
||||
%0:vgpr(p3) = COPY $vgpr0
|
||||
%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
|
||||
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
|
||||
|
@@ -70,15 +70,15 @@ body: |
|
|||
|
||||
; GFX7-LABEL: name: load_local_v2s64
|
||||
; GFX7: liveins: $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX7: $m0 = S_MOV_B32 -1
|
||||
; GFX7: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
|
||||
; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
|
||||
; GFX9-LABEL: name: load_local_v2s64
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX9: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
|
||||
%0:vgpr(p3) = COPY $vgpr0
|
||||
%1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
|
||||
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
|
||||
|
@@ -126,15 +126,15 @@ body: |
|
|||
|
||||
; GFX7-LABEL: name: load_local_s128
|
||||
; GFX7: liveins: $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX7: $m0 = S_MOV_B32 -1
|
||||
; GFX7: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
|
||||
; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
|
||||
; GFX9-LABEL: name: load_local_s128
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
|
||||
; GFX9: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
|
||||
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
|
||||
; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
|
||||
%0:vgpr(p3) = COPY $vgpr0
|
||||
%1:vgpr(s128) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
|
||||
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
|
||||
|
|
|
@@ -0,0 +1,300 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
|
||||
|
||||
; FIXME:
|
||||
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
|
||||
|
||||
define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_u8 v1, v0
|
||||
; GFX9-NEXT: ds_read_u8 v2, v0 offset:1
|
||||
; GFX9-NEXT: ds_read_u8 v4, v0 offset:2
|
||||
; GFX9-NEXT: ds_read_u8 v5, v0 offset:3
|
||||
; GFX9-NEXT: ds_read_u8 v6, v0 offset:4
|
||||
; GFX9-NEXT: ds_read_u8 v7, v0 offset:5
|
||||
; GFX9-NEXT: ds_read_u8 v8, v0 offset:6
|
||||
; GFX9-NEXT: ds_read_u8 v9, v0 offset:7
|
||||
; GFX9-NEXT: s_mov_b32 s5, 8
|
||||
; GFX9-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, s4, v4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, s4, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
|
||||
; GFX9-NEXT: v_or3_b32 v4, v1, v2, v4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, v8, v3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v5, v9, v3
|
||||
; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
|
||||
; GFX9-NEXT: v_or3_b32 v1, v1, v2, v5
|
||||
; GFX9-NEXT: ds_read_u8 v2, v0 offset:8
|
||||
; GFX9-NEXT: ds_read_u8 v6, v0 offset:9
|
||||
; GFX9-NEXT: ds_read_u8 v7, v0 offset:10
|
||||
; GFX9-NEXT: ds_read_u8 v8, v0 offset:11
|
||||
; GFX9-NEXT: ds_read_u8 v9, v0 offset:12
|
||||
; GFX9-NEXT: ds_read_u8 v10, v0 offset:13
|
||||
; GFX9-NEXT: ds_read_u8 v11, v0 offset:14
|
||||
; GFX9-NEXT: ds_read_u8 v0, v0 offset:15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_and_or_b32 v2, v2, v3, v6
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, v7, v3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX9-NEXT: v_and_b32_e32 v7, v8, v3
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
|
||||
; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, v11, v3
|
||||
; GFX9-NEXT: v_and_or_b32 v5, v9, v3, v5
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; GFX9-NEXT: v_or3_b32 v3, v5, v6, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX7-NEXT: ds_read_u8 v1, v0
|
||||
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
|
||||
; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
|
||||
; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
|
||||
; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
|
||||
; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
|
||||
; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
|
||||
; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v7, v3
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
|
||||
; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
|
||||
; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
|
||||
; GFX7-NEXT: ds_read_u8 v7, v0 offset:11
|
||||
; GFX7-NEXT: ds_read_u8 v8, v0 offset:12
|
||||
; GFX7-NEXT: ds_read_u8 v9, v0 offset:13
|
||||
; GFX7-NEXT: ds_read_u8 v10, v0 offset:14
|
||||
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v6, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v6, v10, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v3, v5, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align2:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0xffff
|
||||
; GFX9-NEXT: ds_read_u16 v1, v0
|
||||
; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
|
||||
; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
|
||||
; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
|
||||
; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
|
||||
; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
|
||||
; GFX9-NEXT: ds_read_u16 v7, v0 offset:12
|
||||
; GFX9-NEXT: ds_read_u16 v8, v0 offset:14
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, s4, v2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, s4, v6
|
||||
; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, s4, v8
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2
|
||||
; GFX9-NEXT: v_and_or_b32 v3, v7, s4, v3
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align2:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_u16 v1, v0
|
||||
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
|
||||
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
|
||||
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
|
||||
; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
|
||||
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
|
||||
; GFX7-NEXT: ds_read_u16 v7, v0 offset:12
|
||||
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
|
||||
; GFX7-NEXT: s_mov_b32 s4, 0xffff
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(7)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, s4, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v8
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, s4, v7
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align4:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align8:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align16:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
|
||||
ret <4 x i32> %load
|
||||
}
|
|
@@ -0,0 +1,260 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
|
||||
|
||||
; FIXME:
|
||||
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
|
||||
|
||||
define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-NEXT: ds_read_u8 v0, v0
|
||||
; GFX9-NEXT: ds_read_u8 v1, v2 offset:1
|
||||
; GFX9-NEXT: ds_read_u8 v4, v2 offset:2
|
||||
; GFX9-NEXT: ds_read_u8 v5, v2 offset:3
|
||||
; GFX9-NEXT: ds_read_u8 v6, v2 offset:4
|
||||
; GFX9-NEXT: ds_read_u8 v7, v2 offset:5
|
||||
; GFX9-NEXT: ds_read_u8 v8, v2 offset:6
|
||||
; GFX9-NEXT: ds_read_u8 v9, v2 offset:7
|
||||
; GFX9-NEXT: s_mov_b32 s5, 8
|
||||
; GFX9-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, s4, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
|
||||
; GFX9-NEXT: v_or3_b32 v0, v0, v1, v4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NEXT: v_and_b32_e32 v4, v8, v3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v5, v9, v3
|
||||
; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
|
||||
; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5
|
||||
; GFX9-NEXT: ds_read_u8 v4, v2 offset:8
|
||||
; GFX9-NEXT: ds_read_u8 v5, v2 offset:9
|
||||
; GFX9-NEXT: ds_read_u8 v6, v2 offset:10
|
||||
; GFX9-NEXT: ds_read_u8 v2, v2 offset:11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, 8
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_and_or_b32 v4, v4, v3, v5
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX9-NEXT: v_and_b32_e32 v5, v6, v3
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX9-NEXT: v_or3_b32 v2, v4, v5, v2
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX7-NEXT: ds_read_u8 v0, v0
|
||||
; GFX7-NEXT: ds_read_u8 v1, v2 offset:1
|
||||
; GFX7-NEXT: ds_read_u8 v4, v2 offset:2
|
||||
; GFX7-NEXT: ds_read_u8 v5, v2 offset:3
|
||||
; GFX7-NEXT: ds_read_u8 v6, v2 offset:4
|
||||
; GFX7-NEXT: ds_read_u8 v7, v2 offset:5
|
||||
; GFX7-NEXT: ds_read_u8 v8, v2 offset:6
|
||||
; GFX7-NEXT: ds_read_u8 v9, v2 offset:7
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v7, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: ds_read_u8 v4, v2 offset:8
|
||||
; GFX7-NEXT: ds_read_u8 v5, v2 offset:9
|
||||
; GFX7-NEXT: ds_read_u8 v6, v2 offset:10
|
||||
; GFX7-NEXT: ds_read_u8 v2, v2 offset:11
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v4, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align2:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_u16 v1, v0
|
||||
; GFX9-NEXT: ds_read_u16 v2, v0 offset:2
|
||||
; GFX9-NEXT: ds_read_u16 v3, v0 offset:4
|
||||
; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
|
||||
; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
|
||||
; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0xffff
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, s4, v2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, s4, v6
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align2:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_u16 v1, v0
|
||||
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
|
||||
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
|
||||
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
|
||||
; GFX7-NEXT: ds_read_u16 v5, v0 offset:8
|
||||
; GFX7-NEXT: ds_read_u16 v6, v0 offset:10
|
||||
; GFX7-NEXT: s_mov_b32 s4, 0xffff
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, s4, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align4:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align4:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
|
||||
; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align8:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b64 v[0:1], v0
|
||||
; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align16:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
|
||||
ret <3 x i32> %load
|
||||
}
|
|
@@ -0,0 +1,252 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
|
||||
|
||||
; Unaligned DS access is available from GFX9 onwards.
|
||||
; LDS alignment enforcement is controlled by a configuration register:
|
||||
; SH_MEM_CONFIG.alignment_mode
|
||||
|
||||
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v4i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b128 v[0:3], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v4i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX7-NEXT: ds_read_u8 v1, v0
|
||||
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
|
||||
; GFX7-NEXT: ds_read_u8 v4, v0 offset:2
|
||||
; GFX7-NEXT: ds_read_u8 v5, v0 offset:3
|
||||
; GFX7-NEXT: ds_read_u8 v6, v0 offset:4
|
||||
; GFX7-NEXT: ds_read_u8 v7, v0 offset:5
|
||||
; GFX7-NEXT: ds_read_u8 v8, v0 offset:6
|
||||
; GFX7-NEXT: ds_read_u8 v9, v0 offset:7
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, s4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v7, v3
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
|
||||
; GFX7-NEXT: ds_read_u8 v5, v0 offset:9
|
||||
; GFX7-NEXT: ds_read_u8 v6, v0 offset:10
|
||||
; GFX7-NEXT: ds_read_u8 v7, v0 offset:11
|
||||
; GFX7-NEXT: ds_read_u8 v8, v0 offset:12
|
||||
; GFX7-NEXT: ds_read_u8 v9, v0 offset:13
|
||||
; GFX7-NEXT: ds_read_u8 v10, v0 offset:14
|
||||
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v6, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v6, v10, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v3, v5, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
|
||||
ret <4 x i32> %load
|
||||
}
|
||||
|
||||
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
|
||||
; GFX9-LABEL: load_lds_v3i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_read_b96 v[0:2], v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: load_lds_v3i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX7-NEXT: ds_read_u8 v0, v0
|
||||
; GFX7-NEXT: ds_read_u8 v1, v2 offset:1
|
||||
; GFX7-NEXT: ds_read_u8 v4, v2 offset:2
|
||||
; GFX7-NEXT: ds_read_u8 v5, v2 offset:3
|
||||
; GFX7-NEXT: ds_read_u8 v6, v2 offset:4
|
||||
; GFX7-NEXT: ds_read_u8 v7, v2 offset:5
|
||||
; GFX7-NEXT: ds_read_u8 v8, v2 offset:6
|
||||
; GFX7-NEXT: ds_read_u8 v9, v2 offset:7
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v4
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v7, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v8, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v9, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
|
||||
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
|
||||
; GFX7-NEXT: ds_read_u8 v4, v2 offset:8
|
||||
; GFX7-NEXT: ds_read_u8 v5, v2 offset:9
|
||||
; GFX7-NEXT: ds_read_u8 v6, v2 offset:10
|
||||
; GFX7-NEXT: ds_read_u8 v2, v2 offset:11
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; GFX7-NEXT: v_and_b32_e32 v4, v4, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v5, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v5, v6, v3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
|
||||
; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
|
||||
ret <3 x i32> %load
|
||||
}
|
||||
|
||||
define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
|
||||
; GFX9-LABEL: store_lds_v4i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b128 v0, v[1:4]
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX7-LABEL: store_lds_v4i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:1
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:2
|
||||
; GFX7-NEXT: ds_write_b8 v0, v7 offset:3
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v8 offset:5
|
||||
; GFX7-NEXT: ds_write_b8 v0, v9 offset:6
|
||||
; GFX7-NEXT: ds_write_b8 v0, v10 offset:7
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4
|
||||
; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
|
||||
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
|
||||
; GFX7-NEXT: ds_write_b8 v0, v2 offset:10
|
||||
; GFX7-NEXT: ds_write_b8 v0, v5 offset:11
|
||||
; GFX7-NEXT: ds_write_b8 v0, v4 offset:12
|
||||
; GFX7-NEXT: ds_write_b8 v0, v6 offset:13
|
||||
; GFX7-NEXT: ds_write_b8 v0, v7 offset:14
|
||||
; GFX7-NEXT: ds_write_b8 v0, v8 offset:15
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_write_b96 v0, v[1:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX7-NEXT: ds_write_b8 v0, v1
; GFX7-NEXT: ds_write_b8 v0, v4 offset:1
; GFX7-NEXT: ds_write_b8 v0, v5 offset:2
; GFX7-NEXT: ds_write_b8 v0, v6 offset:3
; GFX7-NEXT: ds_write_b8 v0, v2 offset:4
; GFX7-NEXT: ds_write_b8 v0, v7 offset:5
; GFX7-NEXT: ds_write_b8 v0, v8 offset:6
; GFX7-NEXT: ds_write_b8 v0, v9 offset:7
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3
; GFX7-NEXT: ds_write_b8 v0, v3 offset:8
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
; GFX7-NEXT: ds_write_b8 v0, v2 offset:10
; GFX7-NEXT: ds_write_b8 v0, v4 offset:11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
}
@@ -0,0 +1,301 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s

; FIXME:
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s

define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: ds_write_b128 v4, v[0:3]
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out
ret void
}

define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s5, s0, 8
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s5, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v7, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v8, s5
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
; GFX9-NEXT: s_lshr_b32 s4, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_lshr_b32 s0, s3, 8
; GFX9-NEXT: s_lshr_b32 s1, s3, 16
; GFX9-NEXT: s_lshr_b32 s2, s3, 24
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_mov_b32_e32 v8, s2
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
; GFX9-NEXT: ds_write_b8 v1, v5 offset:12
; GFX9-NEXT: ds_write_b8 v1, v6 offset:13
; GFX9-NEXT: ds_write_b8 v1, v7 offset:14
; GFX9-NEXT: ds_write_b8 v1, v8 offset:15
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s5, s0, 8
; GFX7-NEXT: s_lshr_b32 s6, s0, 16
; GFX7-NEXT: s_lshr_b32 s7, s0, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: s_lshr_b32 s4, s1, 16
; GFX7-NEXT: s_lshr_b32 s5, s1, 24
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s6
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v7, s4
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v8, s5
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
; GFX7-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: s_lshr_b32 s0, s3, 8
; GFX7-NEXT: s_lshr_b32 s1, s3, 16
; GFX7-NEXT: s_lshr_b32 s2, s3, 24
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v7, s1
; GFX7-NEXT: v_mov_b32_e32 v8, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
; GFX7-NEXT: ds_write_b8 v1, v5 offset:12
; GFX7-NEXT: ds_write_b8 v1, v6 offset:13
; GFX7-NEXT: ds_write_b8 v1, v7 offset:14
; GFX7-NEXT: ds_write_b8 v1, v8 offset:15
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
}

define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: s_lshr_b32 s0, s3, 16
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: v_mov_b32_e32 v8, s0
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
; GFX9-NEXT: ds_write_b16 v1, v7 offset:12
; GFX9-NEXT: ds_write_b16 v1, v8 offset:14
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: s_lshr_b32 s0, s3, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s2
; GFX7-NEXT: v_mov_b32_e32 v7, s3
; GFX7-NEXT: v_mov_b32_e32 v8, s0
; GFX7-NEXT: ds_write_b16 v1, v0
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
; GFX7-NEXT: ds_write_b16 v1, v7 offset:12
; GFX7-NEXT: ds_write_b16 v1, v8 offset:14
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
ret void
}

define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align4:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset1:1
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: ds_write2_b32 v1, v0, v2 offset0:2 offset1:3
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
ret void
}

define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
ret void
}

define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: ds_write_b128 v4, v[0:3]
; GFX7-NEXT: s_endpgm
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
ret void
}
@@ -0,0 +1,262 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s

; FIXME:
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s

define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: ds_write_b96 v3, v[0:2]
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out
ret void
}

define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshr_b32 s4, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: ds_write_b8 v1, v0
; GFX9-NEXT: ds_write_b8 v1, v2 offset:1
; GFX9-NEXT: ds_write_b8 v1, v3 offset:2
; GFX9-NEXT: ds_write_b8 v1, v4 offset:3
; GFX9-NEXT: ds_write_b8 v1, v5 offset:4
; GFX9-NEXT: ds_write_b8 v1, v6 offset:5
; GFX9-NEXT: ds_write_b8 v1, v7 offset:6
; GFX9-NEXT: ds_write_b8 v1, v8 offset:7
; GFX9-NEXT: s_lshr_b32 s0, s2, 8
; GFX9-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NEXT: s_lshr_b32 s3, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: ds_write_b8 v1, v2 offset:9
; GFX9-NEXT: ds_write_b8 v1, v3 offset:10
; GFX9-NEXT: ds_write_b8 v1, v4 offset:11
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s3, s0, 8
; GFX7-NEXT: s_lshr_b32 s5, s0, 16
; GFX7-NEXT: s_lshr_b32 s6, s0, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: s_lshr_b32 s0, s1, 8
; GFX7-NEXT: s_lshr_b32 s3, s1, 16
; GFX7-NEXT: s_lshr_b32 s4, s1, 24
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: v_mov_b32_e32 v7, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v8, s4
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: ds_write_b8 v1, v2 offset:1
; GFX7-NEXT: ds_write_b8 v1, v3 offset:2
; GFX7-NEXT: ds_write_b8 v1, v4 offset:3
; GFX7-NEXT: ds_write_b8 v1, v5 offset:4
; GFX7-NEXT: ds_write_b8 v1, v6 offset:5
; GFX7-NEXT: ds_write_b8 v1, v7 offset:6
; GFX7-NEXT: ds_write_b8 v1, v8 offset:7
; GFX7-NEXT: s_lshr_b32 s0, s2, 8
; GFX7-NEXT: s_lshr_b32 s1, s2, 16
; GFX7-NEXT: s_lshr_b32 s3, s2, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s3
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: ds_write_b8 v1, v2 offset:9
; GFX7-NEXT: ds_write_b8 v1, v3 offset:10
; GFX7-NEXT: ds_write_b8 v1, v4 offset:11
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
}

define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshr_b32 s0, s1, 16
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_mov_b32_e32 v6, s0
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: ds_write_b16 v1, v2 offset:2
; GFX9-NEXT: ds_write_b16 v1, v3 offset:4
; GFX9-NEXT: ds_write_b16 v1, v4 offset:6
; GFX9-NEXT: ds_write_b16 v1, v5 offset:8
; GFX9-NEXT: ds_write_b16 v1, v6 offset:10
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_lshr_b32 s3, s0, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_lshr_b32 s0, s1, 16
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: s_lshr_b32 s0, s2, 16
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v5, s2
; GFX7-NEXT: v_mov_b32_e32 v6, s0
; GFX7-NEXT: ds_write_b16 v1, v0
; GFX7-NEXT: ds_write_b16 v1, v2 offset:2
; GFX7-NEXT: ds_write_b16 v1, v3 offset:4
; GFX7-NEXT: ds_write_b16 v1, v4 offset:6
; GFX7-NEXT: ds_write_b16 v1, v5 offset:8
; GFX7-NEXT: ds_write_b16 v1, v6 offset:10
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
ret void
}

define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align4:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX7-NEXT: ds_write_b32 v2, v3 offset:8
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
ret void
}

define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s2
; GFX7-NEXT: ds_write_b64 v2, v[0:1]
; GFX7-NEXT: ds_write_b32 v2, v3 offset:8
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
ret void
}

define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v3i32_align16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: ds_write_b96 v3, v[0:2]
; GFX7-NEXT: s_endpgm
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
ret void
}