AMDGPU: Move d16 load matching to preprocess step

When matching half of the build_vector to a load, there could still be
a hidden dependency on the other half of the build_vector the pattern
wouldn't detect. If there was an additional chain dependency on the
other value, a cycle could be introduced.

I don't think a tablegen pattern is capable of matching the necessary
conditions, so move this into PreprocessISelDAG. Check isPredecessorOf
for the other value to avoid a cycle. isPredecessorOf carries a warning
that it is expensive, so this should probably be moved eventually into
an MI pass, which will have more freedom to reorder instructions to
help match this. That is currently complicated by the lack of a
computeKnownBits-type mechanism for the selected function.

llvm-svn: 355731
This commit is contained in:
Matt Arsenault 2019-03-08 20:58:11 +00:00
parent ae56ff925b
commit e8c03a2511
13 changed files with 486 additions and 194 deletions

View File

@ -691,7 +691,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
def D16PreservesUnusedBits :
Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;

View File

@ -51,6 +51,8 @@
#include <new>
#include <vector>
#define DEBUG_TYPE "isel"
using namespace llvm;
namespace llvm {
@ -88,7 +90,10 @@ public:
SelectionDAGISel::getAnalysisUsage(AU);
}
bool matchLoadD16FromBuildVector(SDNode *N) const;
bool runOnMachineFunction(MachineFunction &MF) override;
void PreprocessISelDAG() override;
void Select(SDNode *N) override;
StringRef getPassName() const override;
void PostprocessISelDAG() override;
@ -193,6 +198,7 @@ private:
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
SDValue getHi16Elt(SDValue In) const;
bool SelectHi16Elt(SDValue In, SDValue &Src) const;
void SelectADD_SUB_I64(SDNode *N);
@ -236,11 +242,49 @@ public:
SDValue &Offset) override;
bool runOnMachineFunction(MachineFunction &MF) override;
void PreprocessISelDAG() override {}
protected:
// Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
// Peel off one ISD::BITCAST wrapper, if there is one; any other value is
// handed back untouched.
static SDValue stripBitcast(SDValue Val) {
  if (Val.getOpcode() != ISD::BITCAST)
    return Val;
  return Val.getOperand(0);
}
// Recognize (trunc (srl X, 16)), optionally wrapped in a bitcast, i.e. an
// extract of the high 16 bits of a dword. On a match, \p Out receives X with
// one bitcast stripped and the function returns true; otherwise \p Out is left
// unmodified and the function returns false.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Shift = In.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL)
    return false;

  // The shift amount has to be the constant 16.
  ConstantSDNode *Amount = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!Amount || Amount->getZExtValue() != 16)
    return false;

  Out = stripBitcast(Shift.getOperand(0));
  return true;
}
// See through a truncate of a 32-bit value, which merely selects the low
// 16 bits of the same register. Returns the 32-bit source (with one bitcast
// stripped) in that case, and \p In itself otherwise.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() != ISD::TRUNCATE)
    return In;

  SDValue Wide = In.getOperand(0);
  if (Wide.getValueType().getSizeInBits() != 32)
    return In;
  return stripBitcast(Wide);
}
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
@ -270,6 +314,114 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
// Try to rewrite a v2i16/v2f16 BUILD_VECTOR \p N, one of whose halves is
// produced by a load (possibly behind a bitcast), into a single
// AMDGPUISD::LOAD_D16_* memory intrinsic node that takes the other half as a
// tied input operand.
//
// Returns true if the DAG was changed. All uses of the build_vector value and
// of the original load's chain result are redirected to the new node; the
// replaced nodes are not erased here.
//
// Must only be called when the subtarget's d16PreservesUnusedBits() holds
// (asserted below).
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
// Only two-element 16-bit vectors are candidates.
if (VT != MVT::v2i16 && VT != MVT::v2f16)
return false;
SDValue Lo = N->getOperand(0);
SDValue Hi = N->getOperand(1);
LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
// build_vector lo, (load ptr) -> load_d16_hi ptr, lo
// build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
// build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
// Need to check for possible indirect dependencies on the other half of the
// vector to avoid introducing a cycle.
if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
// The new node produces the vector value plus a chain.
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
// Widen the low half to the vector type so it can serve as the tied input.
SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
SDValue Ops[] = {
LdHi->getChain(), LdHi->getBasePtr(), TiedIn
};
// Pick the opcode from the memory type: an i8 load uses the _I8 variant when
// it is a SEXTLOAD and the _U8 variant for every other extension kind;
// anything else is expected to be a plain i16 load.
unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
if (LdHi->getMemoryVT() == MVT::i8) {
LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
} else {
assert(LdHi->getMemoryVT() == MVT::i16);
}
// Reuse the original load's memory VT and memory operand for the new node.
SDValue NewLoadHi =
CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
Ops, LdHi->getMemoryVT(),
LdHi->getMemOperand());
// Redirect users of the build_vector to the new load, and users of the old
// load's chain result (value #1) to the new load's chain.
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
return true;
}
// build_vector (load ptr), hi -> load_d16_lo ptr, hi
// build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
// build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
if (LdLo && Lo.hasOneUse()) {
// The tied input must already be available as a 32-bit value carrying Hi in
// its upper half (see getHi16Elt). Give up if no such value exists, or if it
// depends on the load being replaced, which would introduce a cycle.
SDValue TiedIn = getHi16Elt(Hi);
if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
return false;
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
// Same opcode selection as for the high half, using the _LO variants.
unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
if (LdLo->getMemoryVT() == MVT::i8) {
LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
} else {
assert(LdLo->getMemoryVT() == MVT::i16);
}
// Reinterpret the 32-bit tied input as the vector result type.
TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
SDValue Ops[] = {
LdLo->getChain(), LdLo->getBasePtr(), TiedIn
};
SDValue NewLoadLo =
CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
Ops, LdLo->getMemoryVT(),
LdLo->getMemOperand());
// Redirect the build_vector users and the old load's chain users.
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
return true;
}
return false;
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
if (!Subtarget->d16PreservesUnusedBits())
return;
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
if (N->use_empty())
continue;
switch (N->getOpcode()) {
case ISD::BUILD_VECTOR:
MadeChange |= matchLoadD16FromBuildVector(N);
break;
default:
break;
}
}
if (MadeChange) {
CurDAG->RemoveDeadNodes();
LLVM_DEBUG(dbgs() << "After PreProcess:\n";
CurDAG->dump(););
}
}
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
if (TM.Options.NoNaNsFPMath)
return true;
@ -1889,41 +2041,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
// Peel off one ISD::BITCAST wrapper, if there is one; any other value is
// handed back untouched.
static SDValue stripBitcast(SDValue Val) {
  if (Val.getOpcode() != ISD::BITCAST)
    return Val;
  return Val.getOperand(0);
}
// Recognize (trunc (srl X, 16)), optionally wrapped in a bitcast, i.e. an
// extract of the high 16 bits of a dword. On a match, \p Out receives X with
// one bitcast stripped and the function returns true; otherwise \p Out is left
// unmodified and the function returns false.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);
  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Shift = In.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL)
    return false;

  // The shift amount has to be the constant 16.
  ConstantSDNode *Amount = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!Amount || Amount->getZExtValue() != 16)
    return false;

  Out = stripBitcast(Shift.getOperand(0));
  return true;
}
// See through a truncate of a 32-bit value, which merely selects the low
// 16 bits of the same register. Returns the 32-bit source (with one bitcast
// stripped) in that case, and \p In itself otherwise.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() != ISD::TRUNCATE)
    return In;

  SDValue Wide = In.getOperand(0);
  if (Wide.getValueType().getSizeInBits() != 32)
    return In;
  return stripBitcast(Wide);
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
@ -2076,6 +2193,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
// Build or find an i32 value that holds \p In in bits [31:16].
//
// Handles four cases: undef (yields an i32 undef), integer constants and FP
// constants (the zero-extended bit pattern shifted left by 16), and an
// explicit extract of the high half of a dword (yields that dword). Any other
// input produces a null SDValue.
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *IntImm = dyn_cast<ConstantSDNode>(In)) {
    SDLoc DL(In);
    const uint64_t Shifted = IntImm->getZExtValue() << 16;
    return CurDAG->getConstant(Shifted, DL, MVT::i32);
  }

  if (ConstantFPSDNode *FPImm = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc DL(In);
    // Operate on the raw bit pattern of the floating-point immediate.
    const uint64_t Shifted =
        FPImm->getValueAPF().bitcastToAPInt().getZExtValue() << 16;
    return CurDAG->getConstant(Shifted, DL, MVT::i32);
  }

  SDValue Wide;
  if (!isExtractHiElt(In, Wide))
    return SDValue();
  return Wide;
}
// TODO: Can we identify things like v_mad_mixhi_f16?
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
if (In.isUndef()) {

View File

@ -4186,6 +4186,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(INTERP_P1LL_F16)
NODE_NAME_CASE(INTERP_P1LV_F16)
NODE_NAME_CASE(INTERP_P2_F16)
NODE_NAME_CASE(LOAD_D16_HI)
NODE_NAME_CASE(LOAD_D16_LO)
NODE_NAME_CASE(LOAD_D16_HI_I8)
NODE_NAME_CASE(LOAD_D16_HI_U8)
NODE_NAME_CASE(LOAD_D16_LO_I8)
NODE_NAME_CASE(LOAD_D16_LO_U8)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)

View File

@ -469,6 +469,13 @@ enum NodeType : unsigned {
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
LOAD_D16_LO,
LOAD_D16_HI_I8,
LOAD_D16_HI_U8,
LOAD_D16_LO_I8,
LOAD_D16_LO_U8,
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,

View File

@ -802,7 +802,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator max_oneuse,
ValueType vt = i32> {
// This matches 16 permutations of
// This matches 16 permutations of
// min(max(a, b), max(min(a, b), c))
def : AMDGPUPat <
(min (max_oneuse vt:$src0, vt:$src1),
@ -810,7 +810,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst vt:$src0, vt:$src1, vt:$src2)
>;
// This matches 16 permutations of
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
def : AMDGPUPat <
(max (min_oneuse vt:$src0, vt:$src1),
@ -818,7 +818,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst $src0, $src1, $src2)
>;
}
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <

View File

@ -614,6 +614,10 @@ public:
return getGeneration() >= GFX9;
}
/// \returns true when the subtarget has d16 load/store instructions and
/// SRAM ECC is not enabled.
bool d16PreservesUnusedBits() const {
  if (!hasD16LoadStore())
    return false;
  return !isSRAMECCEnabled();
}
/// Return true if most LDS instructions have an m0 use that requires m0 to be
/// initialized.
bool ldsRequiresM0Init() const {

View File

@ -1376,60 +1376,17 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
}
// XXX - Is it possible to have a complex pattern in a PatFrag?
multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
ValueType vt, PatFrag ld> {
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)))),
(v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
(InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $in)
>;
def : GCNPat <
(build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)))))),
(v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
>;
def : GCNPat <
(build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
(v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
>;
def : GCNPat <
(build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
(v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
>;
}
multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
ValueType vt, PatFrag ld> {
def : GCNPat <
(build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
(vt (Hi16Elt vt:$hi))),
(v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
>;
def : GCNPat <
(build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))))),
(f16 (Hi16Elt f16:$hi))),
(v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
>;
def : GCNPat <
(build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
(vt (Hi16Elt vt:$hi))),
(v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
>;
def : GCNPat <
(build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
(f16 (Hi16Elt f16:$hi))),
(v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
(InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $in)
>;
}
@ -1445,13 +1402,19 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSE
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits] in {
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {

View File

@ -611,30 +611,10 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
def : GCNPat <
(build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
(v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
>;
def : GCNPat <
(build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
(v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
>;
}
multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
def : GCNPat <
(build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
(v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
>;
def : GCNPat <
(build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
(v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
>;
}
// Selection pattern for a d16 DS load: matches the load fragment `frag`
// applied to a DS1Addr1Offset address and a tied input `$in` of type `vt`,
// and emits `inst` with `$in` forwarded as its final operand.
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
(inst $ptr, (as_i16imm $offset), (i1 0), $in)
>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
@ -656,16 +636,19 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
let OtherPredicates = [D16PreservesUnusedBits] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
}
def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <

View File

@ -663,53 +663,15 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset, 0, $slc)
>;
multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
def : GCNPat <
(build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
(v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
>;
// Selection pattern for a d16 FLAT load: matches `node` applied to a
// FLATOffset address and a tied input `$in` of type `vt`, and emits `inst`
// with `$in` forwarded as its final operand.
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
(inst $vaddr, $offset, 0, $slc, $in)
>;
def : GCNPat <
(build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
(v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
>;
}
multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
def : GCNPat <
(build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
(v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
>;
def : GCNPat <
(build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
(v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
>;
}
multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
def : GCNPat <
(build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
(v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
>;
def : GCNPat <
(build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
(v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
>;
}
multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
def : GCNPat <
(build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
(v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
>;
def : GCNPat <
(build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
(v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
>;
}
// Same shape as the FLATOffset-based d16 load pattern, but the address is
// matched with FLATOffsetSigned. The tied input `$in` of type `vt` is
// forwarded as the final operand of `inst`.
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
(inst $vaddr, $offset, 0, $slc, $in)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
@ -817,17 +779,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
let AddedComplexity = 3 in {
defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
}
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
let AddedComplexity = 9 in {
defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
}
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@ -861,14 +825,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
@ -902,7 +871,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
} // End OtherPredicates = [HasFlatGlobalInsts]
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
//===----------------------------------------------------------------------===//

View File

@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
// load_d16_{lo|hi} ptr, tied_input
// Type profile shared by all d16 load nodes: one result and two operands.
// The first operand is a pointer, and the result has the same type as the
// second operand (the tied input).
def SIload_d16 : SDTypeProfile<1, 2, [
SDTCisPtrTy<1>,
SDTCisSameAs<0, 2>
]>;
def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@ -187,6 +194,36 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
// SelectionDAG nodes for the d16 loads, one per AMDGPUISD::LOAD_D16_* opcode.
// Every node uses the SIload_d16 type profile and is marked as a chained node
// that may load and carries a memory operand.

// Low-half variants.
def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
// High-half variants.
def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@ -384,6 +421,51 @@ def si_setcc_uniform : PatFrag <
return true;
}]>;
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for d16 loads
//===----------------------------------------------------------------------===//
// Base fragment for a d16 load: forwards (ptr, tied_in) unchanged to `op`.
class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
// Variants that additionally mix in an address class (LocalAddress,
// GlobalLoadAddress, PrivateAddress, FlatLoadAddress). Those classes are
// defined outside this excerpt; presumably they restrict the fragment to the
// named address space -- confirm against their definitions.
class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
// Fragments wrapping the high-half d16 load nodes (plain, _u8, _i8), one set
// per fragment class: local, global, private, flat.
def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
// The same twelve fragments for the low-half d16 load nodes.
def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
def lshr_rev : PatFrag <
(ops node:$src1, node:$src0),
(srl $src0, $src1)

View File

@ -4,9 +4,8 @@
; combine and a generic insert_vector_elt combine.
; GCN-LABEL: {{^}}combine_loop:
; GCN: flat_load_ushort
; GCN: flat_load_short_d16_hi
; GCN: flat_store_short
; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
bb:
br label %bb1

View File

@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; GCN-LABEL: {{^}}chain_hi_to_lo_private:
; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@ -175,3 +175,128 @@ entry:
%loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
ret void
}
; There is another instruction between the misordered instruction and
; the value dependent load, so a simple operand check is insufficient.
; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
; GFX900: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%load_lo = load i16, i16 addrspace(3)* %gep_lo
%gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
%load_hi = load i16, i16 addrspace(3)* %gep_hi
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
%result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}
; The volatile operations aren't put on the same chain
; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep_multi_chain:
; GFX900: ds_read_u16 v1, v0 offset:2
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
%gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
%load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
%result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}
; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
%load_lo = load i16, i16 addrspace(5)* %gep_lo
%gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
%load_hi = load i16, i16 addrspace(5)* %gep_hi
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
%result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}
; GCN-LABEL: {{^}}chain_hi_to_lo_global_other_dep:
; GFX900: global_load_ushort v2, v[0:1], off offset:2
; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
%gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
%load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
%result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}
; GCN-LABEL: {{^}}chain_hi_to_lo_flat_other_dep:
; GFX900: flat_load_ushort v2, v[0:1] offset:2
; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
%load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
%gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
%load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
%result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}
; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
; GFX900-NEXT: ds_read_u16 v3, v0
; GFX900-NEXT: ds_write_b16 v1, [[K]]
; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
bb:
%gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
%gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
%load_hi = load i16, i16 addrspace(3)* %gep_hi
store i16 123, i16 addrspace(3)* %may.alias
%load_lo = load i16, i16 addrspace(3)* %gep_lo
%to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
%result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
ret <2 x i16> %result
}

View File

@ -880,6 +880,21 @@ entry:
ret <2 x i16> %build1
}
; FIXME: Remove the v_and_b32.
; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
; GCN-NOT: ds_read
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
entry:
%gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
%load0 = load i16, i16 addrspace(3)* %in
%build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
ret <2 x i16> %build1
}
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16