PGO: Scale large counters down to 32-bits

PGO counters are 64-bit and branch weights are 32-bit. Scale them down when necessary, instead of just taking the lower 32 bits. <rdar://problem/16276448> llvm-svn: 203592
2014-03-11 18:18:10 +00:00 · 2014-03-11 18:18:10 +00:00 · 38402dc917
parent f164d9404d
commit 38402dc917
3 changed files with 101 additions and 12 deletions
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@ -872,29 +872,59 @@ void CodeGenPGO::destroyRegionCounters() {
    delete RegionCounts;
 }

+/// \brief Calculate what to divide by to scale weights.
+///
+/// Given the maximum weight, calculate a divisor that will scale all the
+/// weights to strictly less than UINT32_MAX (leaving room for the later +1).
+static uint64_t calculateWeightScale(uint64_t MaxWeight) {
+  return MaxWeight < UINT32_MAX ? 1 : MaxWeight / UINT32_MAX + 1;
+}
+
+/// \brief Scale an individual branch weight (and add 1).
+///
+/// Scale a 64-bit weight down to 32-bits using \c Scale.
+///
+/// According to Laplace's Rule of Succession, it is better to compute the
+/// weight based on the count plus 1, so universally add 1 to the value.
+///
+/// \pre \c Scale was calculated by \a calculateWeightScale() with a weight no
+/// less than \c Weight.
+static uint32_t scaleBranchWeight(uint64_t Weight, uint64_t Scale) {
+  assert(Scale && "scale by 0?");
+  uint64_t Scaled = Weight / Scale + 1;
+  assert(Scaled <= UINT32_MAX && "overflow 32-bits");
+  return Scaled;
+}
+
 llvm::MDNode *CodeGenPGO::createBranchWeights(uint64_t TrueCount,
                                              uint64_t FalseCount) {
+  // Check for empty weights.
  if (!TrueCount && !FalseCount)
    return 0;

+  // Calculate how to scale down to 32-bits.
+  uint64_t Scale = calculateWeightScale(std::max(TrueCount, FalseCount));
+
  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
-  // TODO: need to scale down to 32-bits
-  // According to Laplace's Rule of Succession, it is better to compute the
-  // weight based on the count plus 1.
-  return MDHelper.createBranchWeights(TrueCount + 1, FalseCount + 1);
+  return MDHelper.createBranchWeights(scaleBranchWeight(TrueCount, Scale),
+                                      scaleBranchWeight(FalseCount, Scale));
 }

 llvm::MDNode *CodeGenPGO::createBranchWeights(ArrayRef<uint64_t> Weights) {
-  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
-  // TODO: need to scale down to 32-bits, instead of just truncating.
-  // According to Laplace's Rule of Succession, it is better to compute the
-  // weight based on the count plus 1.
+  // We need at least two elements to create meaningful weights.
+  if (Weights.size() < 2)
+    return 0;
+
+  // Calculate how to scale down to 32-bits.
+  uint64_t Scale = calculateWeightScale(*std::max_element(Weights.begin(),
+                                                          Weights.end()));
+
  SmallVector<uint32_t, 16> ScaledWeights;
  ScaledWeights.reserve(Weights.size());
-  for (ArrayRef<uint64_t>::iterator WI = Weights.begin(), WE = Weights.end();
-       WI != WE; ++WI) {
-    ScaledWeights.push_back(*WI + 1);
-  }
+  for (uint64_t W : Weights)
+    ScaledWeights.push_back(scaleBranchWeight(W, Scale));
+
+  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
  return MDHelper.createBranchWeights(ScaledWeights);
 }

--- a/clang/test/Profile/Inputs/c-counter-overflows.profdata
+++ b/clang/test/Profile/Inputs/c-counter-overflows.profdata
@ -0,0 +1,10 @@
+main 8
+1
+68719476720
+64424509425
+68719476720
+21474836475
+21474836475
+21474836475
+4294967295
+
--- a/clang/test/Profile/c-counter-overflows.c
+++ b/clang/test/Profile/c-counter-overflows.c
@ -0,0 +1,49 @@
+// Test that big branch weights get scaled down to 32-bits, rather than just
+// truncated.
+
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-counter-overflows.c %s -o - -emit-llvm -fprofile-instr-use=%S/Inputs/c-counter-overflows.profdata | FileCheck %s
+
+#include <stdint.h>
+
+// PGOGEN: @[[MAIN:__llvm_pgo_ctr[0-9]*]] = private global [2 x i64] zeroinitializer
+int main(int argc, const char *argv[]) {
+  // Need counts higher than 32-bits.
+  // CHECK: br {{.*}} !prof ![[FOR:[0-9]+]]
+  // max   = 0xffffffff0
+  // scale = 0xffffffff0 / 0xffffffff + 1 = 17
+  // loop-body: 0xffffffff0 / 17 + 1 = 0xf0f0f0f0 + 1 = 4042322161 => -252645135
+  // loop-exit: 0x000000001 / 17 + 1 = 0x00000000 + 1 =          1 =>          1
+  for (uint64_t I = 0; I < 0xffffffff0; ++I) {
+    // max   = 0xffffffff * 15 = 0xefffffff1
+    // scale = 0xefffffff1 / 0xffffffff + 1 = 16
+    // CHECK: br {{.*}} !prof ![[IF:[0-9]+]]
+    if (I & 0xf) {
+      // 0xefffffff1 / 16 + 1 = 0xefffffff + 1 = 4026531840 => -268435456
+    } else {
+      // 0x0ffffffff / 16 + 1 = 0x0fffffff + 1 =  268435456 =>  268435456
+    }
+
+    // max   = 0xffffffff * 5 = 0x4fffffffb
+    // scale = 0x4fffffffb / 0xffffffff + 1 = 6
+    // CHECK: ], !prof ![[SWITCH:[0-9]+]]
+    switch ((I & 0xf) / 5) {
+    case 0:
+      // 0x4fffffffb / 6 + 1 = 0xd5555554 + 1 = 3579139413 => -715827883
+      break;
+    case 1:
+      // 0x4fffffffb / 6 + 1 = 0xd5555554 + 1 = 3579139413 => -715827883
+      break;
+    case 2:
+      // 0x4fffffffb / 6 + 1 = 0xd5555554 + 1 = 3579139413 => -715827883
+      break;
+    default:
+      // 0x0ffffffff / 6 + 1 = 0x2aaaaaaa + 1 =  715827883 =>  715827883
+      break;
+    }
+  }
+  return 0;
+}
+
+// CHECK-DAG: ![[FOR]] = metadata !{metadata !"branch_weights", i32 -252645135, i32 1}
+// CHECK-DAG: ![[IF]]  = metadata !{metadata !"branch_weights", i32 -268435456, i32 268435456}
+// CHECK-DAG: ![[SWITCH]] = metadata !{metadata !"branch_weights", i32 715827883, i32 -715827883, i32 -715827883, i32 -715827883}