R600: Fix mishandling of load / store chains.

Fixes various bugs with reordering loads and stores.
Scalarized vector loads weren't collecting the chains
at all.

llvm-svn: 212473
This commit is contained in:
Matt Arsenault 2014-07-07 18:34:45 +00:00
parent fda9dad17f
commit d2c9e08b63
4 changed files with 194 additions and 36 deletions

View File

@ -1004,22 +1004,36 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
EVT LoadVT = Op.getValueType();
EVT EltVT = Op.getValueType().getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
SmallVector<SDValue, 8> Loads;
SmallVector<SDValue, 8> Chains;
SDLoc SL(Op);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
Load->getChain(), Ptr,
MachinePointerInfo(Load->getMemOperand()->getValue()),
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
Load->getAlignment()));
SDValue NewLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
Load->getChain(), Ptr,
MachinePointerInfo(Load->getMemOperand()->getValue()),
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
Load->getAlignment());
Loads.push_back(NewLoad.getValue(0));
Chains.push_back(NewLoad.getValue(1));
}
return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
SDValue Ops[] = {
DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
};
return DAG.getMergeValues(Ops, SL);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
@ -1122,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Load->getBasePtr(),
MemVT,
Load->getMemOperand());
return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
SDValue Ops[] = {
DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
ExtLoad32.getValue(1)
};
return DAG.getMergeValues(Ops, DL);
}
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
@ -1136,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
SDValue Ops[] = {
DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
NewLD.getValue(1)
};
return DAG.getMergeValues(Ops, DL);
}
// Lower loads constant address space global variable loads
@ -1144,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
isa<GlobalVariable>(
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, MVT::i32));
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
}
@ -1175,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemEltVT = MemVT.getScalarType();
if (ExtType == ISD::SEXTLOAD) {
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
SDValue Ops[] = {
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
Load->getChain()
};
return DAG.getMergeValues(Ops, DL);
}
return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
SDValue Ops[] = {
DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
Load->getChain()
};
return DAG.getMergeValues(Ops, DL);
}
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {

View File

@ -578,7 +578,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() ||
Result.getNode()->getNumValues() == 2) &&
"Load should return a value and a chain");
return Result;
}
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {

View File

@ -618,13 +618,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
(Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
Op.getValueType().getVectorNumElements() > 4))) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
Load->getChain()
};
return DAG.getMergeValues(MergedValues, SDLoc(Op));
return SplitVectorLoad(Op, DAG);
} else {
return LowerLOAD(Op, DAG);
SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() ||
Result.getNode()->getNumValues() == 2) &&
"Load should return a value and a chain");
return Result;
}
}
@ -841,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
SDValue MergedValues[2];
MergedValues[1] = Load->getChain();
if (Ret.getNode()) {
MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, DL);
}
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
if (Lowered.getNode())
return Lowered;
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
@ -860,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
// FIXME: REGISTER_LOAD should probably have a chain result.
SDValue Chain = Load->getChain();
SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
Chain, Ptr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
SDValue Ret = LoLoad.getValue(0);
if (MemVT.getSizeInBits() == 64) {
// TODO: This needs a test to make sure the right thing is happening with
// the chain. That is hard without general function support.
SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
DAG.getConstant(1, MVT::i32));
SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
Load->getChain(), IncPtr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
Chain, IncPtr,
DAG.getTargetConstant(0, MVT::i32),
Op.getOperand(2));
Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
// Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
// LoLoad.getValue(1), HiLoad.getValue(1));
}
MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, DL);
SDValue Ops[] = {
Ret,
Chain
};
return DAG.getMergeValues(Ops, DL);
}
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,

View File

@ -0,0 +1,104 @@
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: @no_reorder_v2f64_global_load_store
; SI: BUFFER_LOAD_DWORDX2
; SI: BUFFER_LOAD_DWORDX2
; SI: BUFFER_LOAD_DWORDX2
; SI: BUFFER_LOAD_DWORDX2
; SI: BUFFER_STORE_DWORDX2
; SI: BUFFER_STORE_DWORDX2
; SI: BUFFER_STORE_DWORDX2
; SI: BUFFER_STORE_DWORDX2
; SI: S_ENDPGM
; Swaps the <2 x double> values at %x and %y: both loads are issued before
; either store. The checks above require all four BUFFER_LOAD_DWORDX2s to
; appear before any BUFFER_STORE_DWORDX2 — if load/store chains were lost,
; a store could be scheduled ahead of a load and clobber the value it reads.
define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
%tmp1 = load <2 x double> addrspace(1)* %x, align 16
%tmp4 = load <2 x double> addrspace(1)* %y, align 16
; Cross-store: %x receives %y's old value and vice versa.
store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
ret void
}
; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
; SI: DS_READ_B64
; SI: DS_READ_B64
; SI: DS_WRITE_B64
; SI: DS_WRITE_B64
; SI: S_ENDPGM
; Same swap pattern through local (addrspace(3)) memory, where the vector
; loads are scalarized by the backend. The checks require both DS_READ_B64s
; before either DS_WRITE_B64 — this covers the commit's note that scalarized
; vector loads were not collecting their chains at all.
define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x double> addrspace(3)* %x, align 16
%tmp4 = load <2 x double> addrspace(3)* %y, align 16
; Cross-store: %x receives %y's old value and vice versa.
store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
ret void
}
; SI-LABEL: @no_reorder_split_v8i32_global_load_store
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_LOAD_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: BUFFER_STORE_DWORD
; SI: S_ENDPGM
; Swap of two <8 x i32> vectors, wide enough to be split into multiple
; loads/stores (the checks expect 16 BUFFER_LOAD_DWORDs followed by 16
; BUFFER_STORE_DWORDs). Verifies that every piece of a split vector load
; keeps its chain so no store is reordered in front of a load.
define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
%tmp1 = load <8 x i32> addrspace(1)* %x, align 32
%tmp4 = load <8 x i32> addrspace(1)* %y, align 32
; Cross-store: %x receives %y's old value and vice versa.
store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
ret void
}
; SI-LABEL: @no_reorder_extload_64
; SI: DS_READ_B64
; SI: DS_READ_B64
; SI: DS_WRITE_B64
; SI-NOT: DS_READ
; SI: DS_WRITE_B64
; SI: S_ENDPGM
; Loads two <2 x i32> values, widens each to <2 x i64>, adds 1, truncates
; back, and cross-stores the results. The SI-NOT: DS_READ between the two
; DS_WRITE_B64 checks requires that the second load is NOT re-issued after
; the first store — i.e. the extended loads' chains keep them ordered
; before the stores.
define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x i32> addrspace(3)* %x, align 8
%tmp4 = load <2 x i32> addrspace(3)* %y, align 8
; Widen to i64 so the loads become extending loads, then do a trivial add.
%tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
%tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
%tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
%tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
%trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
%trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
; Cross-store the incremented values: %x gets %y+1, %y gets %x+1.
store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
ret void
}