llvm-project/llvm/lib/CodeGen/RegAllocScore.cpp


[mlgo][regalloc] Add score calculation for training

Add the calculation of a score, which will be used during ML training. The score quantifies the quality of a regalloc policy, and is independent of what we train (currently, just eviction) and of the regalloc algorithm itself. We can then use scores to guide training (which happens offline) by formulating a reward based on score variation, the goal being lowering scores (currently, that reward is the percentage reduction relative to Greedy's heuristic).

Currently, we compute the score by factoring different instruction counts (loads, stores, etc.) with the machine basic block frequency, regardless of the instructions' provenance - i.e. they could be due to the regalloc policy or have been introduced previously. This is different from RAGreedy::reportStats, which accumulates the effects of the allocator alone. We explored this alternative but found (at least currently) that the more naive alternative introduced here produces better policies. We do intend to consolidate the two, however, as we are actively investigating improvements to our reward function, and will likely want to re-explore scoring just the effects of the allocator. In either case, we want to decouple score calculation from the allocation algorithm, as we currently evaluate it a few passes after allocation (also because score calculation should be reusable regardless of the allocation algorithm).

We intentionally accumulate counts independently because it facilitates per-block reporting, which we found useful for debugging - for instance, we can easily report the counts independently, and then cross-reference them with perf counter measurements.

Differential Revision: https://reviews.llvm.org/D115195
2021-12-07 06:59:19 +08:00
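The reward mentioned in the commit message - percentage reduction relative to Greedy's heuristic, with lower scores being better - can be sketched as a small standalone helper. The function name and signature below are hypothetical illustrations, not part of LLVM or of this patch:

```cpp
#include <cassert>
#include <cmath>

// Hypothetical sketch of the training reward described above: the
// fractional reduction of the learned policy's score relative to the
// Greedy heuristic's score. Because lower scores are better, a positive
// value means the policy improved on Greedy.
inline double scoreReward(double GreedyScore, double PolicyScore) {
  if (GreedyScore == 0.0)
    return 0.0; // degenerate case: nothing to improve on
  return (GreedyScore - PolicyScore) / GreedyScore;
}
```

For example, a policy score of 8.0 against a Greedy score of 10.0 would yield a reward of 0.2, i.e. a 20% reduction.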
//===- RegAllocScore.cpp - evaluate regalloc policy quality ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// Calculate a measure of the register allocation policy quality. This is used
/// to construct a reward for the training of the ML-driven allocation policy.
/// Currently, the score is the sum of the machine basic block frequency-weighted
/// number of loads, stores, copies, and remat instructions, each factored with
/// a relative weight.
//===----------------------------------------------------------------------===//
#include "RegAllocScore.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/ilist_iterator.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundleIterator.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden);
cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), cl::Hidden);
cl::opt<double> StoreWeight("regalloc-store-weight", cl::init(1.0), cl::Hidden);
cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2),
                                 cl::Hidden);
cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight",
                                     cl::init(1.0), cl::Hidden);
#define DEBUG_TYPE "regalloc-score"
RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) {
  CopyCounts += Other.copyCounts();
  LoadCounts += Other.loadCounts();
  StoreCounts += Other.storeCounts();
  LoadStoreCounts += Other.loadStoreCounts();
  CheapRematCounts += Other.cheapRematCounts();
  ExpensiveRematCounts += Other.expensiveRematCounts();
  return *this;
}
bool RegAllocScore::operator==(const RegAllocScore &Other) const {
  return copyCounts() == Other.copyCounts() &&
         loadCounts() == Other.loadCounts() &&
         storeCounts() == Other.storeCounts() &&
         loadStoreCounts() == Other.loadStoreCounts() &&
         cheapRematCounts() == Other.cheapRematCounts() &&
         expensiveRematCounts() == Other.expensiveRematCounts();
}
bool RegAllocScore::operator!=(const RegAllocScore &Other) const {
  return !(*this == Other);
}
double RegAllocScore::getScore() const {
  double Ret = 0.0;
  Ret += CopyWeight * copyCounts();
  Ret += LoadWeight * loadCounts();
  Ret += StoreWeight * storeCounts();
  Ret += (LoadWeight + StoreWeight) * loadStoreCounts();
  Ret += CheapRematWeight * cheapRematCounts();
  Ret += ExpensiveRematWeight * expensiveRematCounts();
  return Ret;
}
RegAllocScore
llvm::calculateRegAllocScore(const MachineFunction &MF,
                             const MachineBlockFrequencyInfo &MBFI,
                             AAResults &AAResults) {
  return calculateRegAllocScore(
      MF,
      [&](const MachineBasicBlock &MBB) {
        return MBFI.getBlockFreqRelativeToEntryBlock(&MBB);
      },
      [&](const MachineInstr &MI) {
        return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable(
            MI, &AAResults);
      });
}
RegAllocScore llvm::calculateRegAllocScore(
    const MachineFunction &MF,
    llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq,
    llvm::function_ref<bool(const MachineInstr &)>
        IsTriviallyRematerializable) {
  RegAllocScore Total;
  for (const MachineBasicBlock &MBB : MF) {
    double BlockFreqRelativeToEntrypoint = GetBBFreq(MBB);
    RegAllocScore MBBScore;
    for (const MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isKill() || MI.isInlineAsm()) {
        continue;
      }
      if (MI.isCopy()) {
        MBBScore.onCopy(BlockFreqRelativeToEntrypoint);
      } else if (IsTriviallyRematerializable(MI)) {
        if (MI.getDesc().isAsCheapAsAMove()) {
          MBBScore.onCheapRemat(BlockFreqRelativeToEntrypoint);
        } else {
          MBBScore.onExpensiveRemat(BlockFreqRelativeToEntrypoint);
        }
      } else if (MI.mayLoad() && MI.mayStore()) {
        MBBScore.onLoadStore(BlockFreqRelativeToEntrypoint);
      } else if (MI.mayLoad()) {
        MBBScore.onLoad(BlockFreqRelativeToEntrypoint);
      } else if (MI.mayStore()) {
        MBBScore.onStore(BlockFreqRelativeToEntrypoint);
      }
    }
    Total += MBBScore;
  }
  return Total;
}
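As a self-contained illustration of the scoring scheme above, the per-block, frequency-weighted accumulation reduces to the following sketch. This is not LLVM code: the `sketch` namespace, `Kind`, and `Block` are hypothetical stand-ins, and the weights are hard-coded to the `cl::opt` defaults from the file:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

namespace sketch {
// Hypothetical instruction categories mirroring the file's counters.
enum class Kind { Copy, Load, Store, LoadStore, CheapRemat, ExpensiveRemat };

struct Block {
  double Freq;             // block frequency relative to the entry block
  std::vector<Kind> Insts; // the categorized instructions in this block
};

// Each instruction contributes its block's entry-relative frequency to the
// counter for its category; the score is the weight-factored sum. Weights
// are the cl::opt defaults (copy 0.2, load 4.0, store 1.0, cheap remat 0.2,
// expensive remat 1.0; load-store uses load + store = 5.0).
double score(const std::vector<Block> &Blocks) {
  const double CopyW = 0.2, LoadW = 4.0, StoreW = 1.0, CheapW = 0.2, ExpW = 1.0;
  double Copies = 0, Loads = 0, Stores = 0, LoadStores = 0, Cheap = 0, Exp = 0;
  for (const Block &B : Blocks)
    for (Kind K : B.Insts)
      switch (K) {
      case Kind::Copy:           Copies += B.Freq; break;
      case Kind::Load:           Loads += B.Freq; break;
      case Kind::Store:          Stores += B.Freq; break;
      case Kind::LoadStore:      LoadStores += B.Freq; break;
      case Kind::CheapRemat:     Cheap += B.Freq; break;
      case Kind::ExpensiveRemat: Exp += B.Freq; break;
      }
  return CopyW * Copies + LoadW * Loads + StoreW * Stores +
         (LoadW + StoreW) * LoadStores + CheapW * Cheap + ExpW * Exp;
}
} // namespace sketch
```

For instance, a single block at entry frequency 1.0 containing one load and one copy would score 4.0 + 0.2 = 4.2 under the default weights.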