forked from OSchip/llvm-project
R600: Fix mishandling of load / store chains.
Fixes various bugs with reordering loads and stores. Scalarized vector loads weren't collecting the chains at all. llvm-svn: 212473
This commit is contained in:
parent
fda9dad17f
commit
d2c9e08b63
|
@ -1004,22 +1004,36 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
|
|||
SelectionDAG &DAG) const {
|
||||
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
|
||||
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
|
||||
EVT LoadVT = Op.getValueType();
|
||||
EVT EltVT = Op.getValueType().getVectorElementType();
|
||||
EVT PtrVT = Load->getBasePtr().getValueType();
|
||||
|
||||
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
|
||||
SmallVector<SDValue, 8> Loads;
|
||||
SmallVector<SDValue, 8> Chains;
|
||||
|
||||
SDLoc SL(Op);
|
||||
|
||||
for (unsigned i = 0, e = NumElts; i != e; ++i) {
|
||||
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
|
||||
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
|
||||
Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
|
||||
Load->getChain(), Ptr,
|
||||
MachinePointerInfo(Load->getMemOperand()->getValue()),
|
||||
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
|
||||
Load->getAlignment()));
|
||||
|
||||
SDValue NewLoad
|
||||
= DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
|
||||
Load->getChain(), Ptr,
|
||||
MachinePointerInfo(Load->getMemOperand()->getValue()),
|
||||
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
|
||||
Load->getAlignment());
|
||||
Loads.push_back(NewLoad.getValue(0));
|
||||
Chains.push_back(NewLoad.getValue(1));
|
||||
}
|
||||
return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
|
||||
|
||||
SDValue Ops[] = {
|
||||
DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
|
||||
DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, SL);
|
||||
}
|
||||
|
||||
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
|
||||
|
@ -1122,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
Load->getBasePtr(),
|
||||
MemVT,
|
||||
Load->getMemOperand());
|
||||
return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
|
||||
|
||||
SDValue Ops[] = {
|
||||
DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
|
||||
ExtLoad32.getValue(1)
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
|
||||
|
@ -1136,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
|
||||
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
|
||||
BasePtr, MVT::i8, MMO);
|
||||
return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
|
||||
|
||||
SDValue Ops[] = {
|
||||
DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
|
||||
NewLD.getValue(1)
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
// Lower loads constant address space global variable loads
|
||||
|
@ -1144,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
isa<GlobalVariable>(
|
||||
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
|
||||
|
||||
|
||||
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
|
||||
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
|
||||
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
|
||||
DAG.getConstant(2, MVT::i32));
|
||||
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
|
||||
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
|
||||
Load->getChain(), Ptr,
|
||||
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
|
||||
}
|
||||
|
@ -1175,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
EVT MemEltVT = MemVT.getScalarType();
|
||||
if (ExtType == ISD::SEXTLOAD) {
|
||||
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
|
||||
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
|
||||
|
||||
SDValue Ops[] = {
|
||||
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
|
||||
Load->getChain()
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
|
||||
SDValue Ops[] = {
|
||||
DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
|
||||
Load->getChain()
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
|
|
@ -578,7 +578,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
|
|||
case ISD::FSIN: return LowerTrig(Op, DAG);
|
||||
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
||||
case ISD::STORE: return LowerSTORE(Op, DAG);
|
||||
case ISD::LOAD: return LowerLOAD(Op, DAG);
|
||||
case ISD::LOAD: {
|
||||
SDValue Result = LowerLOAD(Op, DAG);
|
||||
assert((!Result.getNode() ||
|
||||
Result.getNode()->getNumValues() == 2) &&
|
||||
"Load should return a value and a chain");
|
||||
return Result;
|
||||
}
|
||||
|
||||
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
|
||||
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
|
||||
case ISD::INTRINSIC_VOID: {
|
||||
|
|
|
@ -618,13 +618,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
|
||||
(Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
|
||||
Op.getValueType().getVectorNumElements() > 4))) {
|
||||
SDValue MergedValues[2] = {
|
||||
SplitVectorLoad(Op, DAG),
|
||||
Load->getChain()
|
||||
};
|
||||
return DAG.getMergeValues(MergedValues, SDLoc(Op));
|
||||
return SplitVectorLoad(Op, DAG);
|
||||
} else {
|
||||
return LowerLOAD(Op, DAG);
|
||||
SDValue Result = LowerLOAD(Op, DAG);
|
||||
assert((!Result.getNode() ||
|
||||
Result.getNode()->getNumValues() == 2) &&
|
||||
"Load should return a value and a chain");
|
||||
return Result;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -841,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
|
|||
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDLoc DL(Op);
|
||||
LoadSDNode *Load = cast<LoadSDNode>(Op);
|
||||
SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
|
||||
SDValue MergedValues[2];
|
||||
MergedValues[1] = Load->getChain();
|
||||
if (Ret.getNode()) {
|
||||
MergedValues[0] = Ret;
|
||||
return DAG.getMergeValues(MergedValues, DL);
|
||||
}
|
||||
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
|
||||
if (Lowered.getNode())
|
||||
return Lowered;
|
||||
|
||||
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
|
||||
return SDValue();
|
||||
|
@ -860,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
|
||||
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
|
||||
DAG.getConstant(2, MVT::i32));
|
||||
Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
|
||||
Load->getChain(), Ptr,
|
||||
DAG.getTargetConstant(0, MVT::i32),
|
||||
Op.getOperand(2));
|
||||
|
||||
// FIXME: REGISTER_LOAD should probably have a chain result.
|
||||
SDValue Chain = Load->getChain();
|
||||
SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
|
||||
Chain, Ptr,
|
||||
DAG.getTargetConstant(0, MVT::i32),
|
||||
Op.getOperand(2));
|
||||
|
||||
SDValue Ret = LoLoad.getValue(0);
|
||||
if (MemVT.getSizeInBits() == 64) {
|
||||
// TODO: This needs a test to make sure the right thing is happening with
|
||||
// the chain. That is hard without general function support.
|
||||
|
||||
SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
|
||||
DAG.getConstant(1, MVT::i32));
|
||||
|
||||
SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
|
||||
Load->getChain(), IncPtr,
|
||||
DAG.getTargetConstant(0, MVT::i32),
|
||||
Op.getOperand(2));
|
||||
SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
|
||||
Chain, IncPtr,
|
||||
DAG.getTargetConstant(0, MVT::i32),
|
||||
Op.getOperand(2));
|
||||
|
||||
Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
|
||||
Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
|
||||
// Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
|
||||
// LoLoad.getValue(1), HiLoad.getValue(1));
|
||||
}
|
||||
|
||||
MergedValues[0] = Ret;
|
||||
return DAG.getMergeValues(MergedValues, DL);
|
||||
SDValue Ops[] = {
|
||||
Ret,
|
||||
Chain
|
||||
};
|
||||
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
; SI-LABEL: @no_reorder_v2f64_global_load_store
|
||||
; SI: BUFFER_LOAD_DWORDX2
|
||||
; SI: BUFFER_LOAD_DWORDX2
|
||||
; SI: BUFFER_LOAD_DWORDX2
|
||||
; SI: BUFFER_LOAD_DWORDX2
|
||||
; SI: BUFFER_STORE_DWORDX2
|
||||
; SI: BUFFER_STORE_DWORDX2
|
||||
; SI: BUFFER_STORE_DWORDX2
|
||||
; SI: BUFFER_STORE_DWORDX2
|
||||
; SI: S_ENDPGM
|
||||
define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
|
||||
%tmp1 = load <2 x double> addrspace(1)* %x, align 16
|
||||
%tmp4 = load <2 x double> addrspace(1)* %y, align 16
|
||||
store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
|
||||
store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
|
||||
; SI: DS_READ_B64
|
||||
; SI: DS_READ_B64
|
||||
; SI: DS_WRITE_B64
|
||||
; SI: DS_WRITE_B64
|
||||
; SI: S_ENDPGM
|
||||
define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
|
||||
%tmp1 = load <2 x double> addrspace(3)* %x, align 16
|
||||
%tmp4 = load <2 x double> addrspace(3)* %y, align 16
|
||||
store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
|
||||
store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @no_reorder_split_v8i32_global_load_store
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
; SI: BUFFER_LOAD_DWORD
|
||||
|
||||
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: BUFFER_STORE_DWORD
|
||||
; SI: S_ENDPGM
|
||||
define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
|
||||
%tmp1 = load <8 x i32> addrspace(1)* %x, align 32
|
||||
%tmp4 = load <8 x i32> addrspace(1)* %y, align 32
|
||||
store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
|
||||
store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @no_reorder_extload_64
|
||||
; SI: DS_READ_B64
|
||||
; SI: DS_READ_B64
|
||||
; SI: DS_WRITE_B64
|
||||
; SI-NOT: DS_READ
|
||||
; SI: DS_WRITE_B64
|
||||
; SI: S_ENDPGM
|
||||
define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
|
||||
%tmp1 = load <2 x i32> addrspace(3)* %x, align 8
|
||||
%tmp4 = load <2 x i32> addrspace(3)* %y, align 8
|
||||
%tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
|
||||
%tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
|
||||
%tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
|
||||
%tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
|
||||
%trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
|
||||
%trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
|
||||
store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
|
||||
store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue