forked from OSchip/llvm-project
Correct NumLoads in clustering
The scheduler passes a NumLoads argument to shouldClusterMemOps() that is one less than the actual cluster length, so for 2 instructions it passes just 1. Correct this number. This is NFC for in-tree targets. Differential Revision: https://reviews.llvm.org/D73292
This commit is contained in:
parent
9c346464c1
commit
be8e38cbd9
|
@ -1276,6 +1276,10 @@ public:
|
|||
/// or
|
||||
/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
|
||||
/// to TargetPassConfig::createMachineScheduler() to have an effect.
|
||||
///
|
||||
/// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
|
||||
/// \p NumLoads is the number of loads that will be in the cluster if this
|
||||
/// hook returns true.
|
||||
virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
|
||||
ArrayRef<const MachineOperand *> BaseOps2,
|
||||
unsigned NumLoads) const {
|
||||
|
|
|
@ -1584,7 +1584,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
|
|||
SUnit *SUb = MemOpRecords[Idx+1].SU;
|
||||
if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
|
||||
MemOpRecords[Idx + 1].BaseOps,
|
||||
ClusterLength)) {
|
||||
ClusterLength + 1)) {
|
||||
if (SUa->NodeNum > SUb->NodeNum)
|
||||
std::swap(SUa, SUb);
|
||||
if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
|
||||
|
|
|
@ -2422,7 +2422,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(
|
|||
return false;
|
||||
|
||||
// Only cluster up to a single pair.
|
||||
if (NumLoads > 1)
|
||||
if (NumLoads > 2)
|
||||
return false;
|
||||
|
||||
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
|
||||
|
|
|
@ -457,7 +457,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
|
|||
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
|
||||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
|
||||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
|
||||
const unsigned MaxGlobalLoadCluster = 6;
|
||||
const unsigned MaxGlobalLoadCluster = 7;
|
||||
if (NumLoads > MaxGlobalLoadCluster)
|
||||
return false;
|
||||
|
||||
|
@ -497,7 +497,11 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
|
|||
? MRI.getRegClass(Reg)
|
||||
: RI.getPhysRegClass(Reg);
|
||||
|
||||
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
|
||||
// FIXME: 1 should not be subtracted from NumLoads. This is to match behavior
|
||||
// of clusterNeighboringMemOps which was previously passing cluster length
|
||||
// less 1. LoadClusterThreshold should be tuned instead.
|
||||
return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
|
||||
LoadClusterThreshold;
|
||||
}
|
||||
|
||||
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
|
||||
|
|
Loading…
Reference in New Issue