Correct NumLoads in clustering

The scheduler passes a NumLoads argument to shouldClusterMemOps()
that is one less than the actual cluster length. So for 2 instructions
it will pass just 1. Correct this number.

This is NFC (no functional change) for in-tree targets.

Differential Revision: https://reviews.llvm.org/D73292
This commit is contained in:
Stanislav Mekhanoshin 2020-01-24 12:02:54 -08:00
parent 9c346464c1
commit be8e38cbd9
4 changed files with 12 additions and 4 deletions

View File

@ -1276,6 +1276,10 @@ public:
/// or
/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
/// to TargetPassConfig::createMachineScheduler() to have an effect.
///
/// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
/// \p NumLoads is the number of loads that will be in the cluster if this
/// hook returns true.
virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2,
unsigned NumLoads) const {

View File

@ -1584,7 +1584,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
SUnit *SUb = MemOpRecords[Idx+1].SU;
if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
MemOpRecords[Idx + 1].BaseOps,
ClusterLength)) {
ClusterLength + 1)) {
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {

View File

@ -2422,7 +2422,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(
return false;
// Only cluster up to a single pair.
if (NumLoads > 1)
if (NumLoads > 2)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))

View File

@ -457,7 +457,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
const unsigned MaxGlobalLoadCluster = 6;
const unsigned MaxGlobalLoadCluster = 7;
if (NumLoads > MaxGlobalLoadCluster)
return false;
@ -497,7 +497,11 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
? MRI.getRegClass(Reg)
: RI.getPhysRegClass(Reg);
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
// FIXME: 1 should not be subtracted from NumLoads. This is to match the
// behavior of clusterNeighboringMemOps, which was previously passing the
// cluster length less 1. LoadClusterThreshold should be tuned instead.
return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
LoadClusterThreshold;
}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,