From d6bb96e677759375b2bea00115918b2cb6552f5b Mon Sep 17 00:00:00 2001
From: Max Kazantsev <mkazantsev@azul.com>
Date: Thu, 21 Jan 2021 11:15:16 +0700
Subject: [PATCH] [X86] Add experimental option to separately tune alignment of
 innermost loops

We already have an experimental option to tune loop alignment. Its impact
is very wide (and there is a suspicion that it's not always profitable). We want
to have something narrower to play with. This patch adds a similar option that
overrides the preferred alignment for innermost loops only. This is for
experimental purposes; the default values do not change the existing behavior.

Differential Revision: https://reviews.llvm.org/D94895
Reviewed By: pengfei
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 16 +++++
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 .../CodeGen/X86/innermost-loop-alignment.ll   | 59 +++++++++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/innermost-loop-alignment.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 852078a299b9..7cd17f109935 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35,6 +35,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -76,6 +77,18 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
         " of the loop header PC will be 0)."),
     cl::Hidden);
 
+// Experimental override of the preferred alignment of innermost loops, as
+// log2 of the alignment in bytes. X86TargetLowering::getPrefLoopAlignment
+// applies it only when the option is actually passed, so the cl::init value
+// by itself does not change any alignment.
+static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
+    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
+    cl::desc(
+        "Sets the preferable loop alignment for experiments (as log2 bytes) "
+        "for innermost loops only. If specified, this option overrides "
+        "alignment set by x86-experimental-pref-loop-alignment."),
+    cl::Hidden);
+
 static cl::opt<bool> MulConstantOptimization(
     "mul-constant-optimization", cl::init(true),
     cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -51696,3 +51705,17 @@ X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
         .getAsInteger(0, StackProbeSize);
   return StackProbeSize;
 }
+
+/// Returns the preferred alignment for loop \p ML.
+///
+/// An innermost loop gets the alignment requested through
+/// -x86-experimental-pref-innermost-loop-alignment when that option was given
+/// on the command line; every other case defers to the generic value.
+Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  // ML may be null: the hook is callable without a loop argument (see the
+  // base-class call below), so check it before dereferencing.
+  if (ML && ML->isInnermost() &&
+      ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
+    return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
+  return TargetLowering::getPrefLoopAlignment();
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8b71c8394c01..76c83b7df9eb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1408,6 +1408,12 @@ namespace llvm {
                                    SDValue Addr, SelectionDAG &DAG)
                                    const override;
 
+    /// Return the preferred alignment for loop \p ML. The definition in
+    /// X86ISelLowering.cpp lets an explicitly passed
+    /// -x86-experimental-pref-innermost-loop-alignment take precedence for
+    /// innermost loops.
+    Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/test/CodeGen/X86/innermost-loop-alignment.ll b/llvm/test/CodeGen/X86/innermost-loop-alignment.ll
new file mode 100644
index 000000000000..fef30fd28716
--- /dev/null
+++ b/llvm/test/CodeGen/X86/innermost-loop-alignment.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s -check-prefix=DEFAULT
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -x86-experimental-pref-innermost-loop-alignment=5 | FileCheck %s -check-prefix=ALIGN32
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -x86-experimental-pref-loop-alignment=5 -x86-experimental-pref-innermost-loop-alignment=6 | FileCheck %s -check-prefix=ALIGN64
+
+; Checks the .p2align emitted before the outer and inner loop headers:
+;  - DEFAULT: neither option is passed.
+;  - ALIGN32: only -x86-experimental-pref-innermost-loop-alignment=5 is passed.
+;  - ALIGN64: -x86-experimental-pref-loop-alignment=5 together with
+;    -x86-experimental-pref-innermost-loop-alignment=6.
+
+declare void @foo()
+
+define void @test(i32 %n, i32 %m) {
+; DEFAULT-LABEL: test:
+; DEFAULT:         .p2align 4, 0x90
+; DEFAULT-NEXT:  .LBB0_1: # %outer
+; DEFAULT-NEXT:    # =>This Loop Header: Depth=1
+; DEFAULT-NEXT:    # Child Loop BB0_2 Depth 2
+; DEFAULT:         .p2align 4, 0x90
+; DEFAULT-NEXT:  .LBB0_2: # %inner
+; DEFAULT-NEXT:    # Parent Loop BB0_1 Depth=1
+; DEFAULT-NEXT:    # => This Inner Loop Header: Depth=2
+
+; ALIGN32-LABEL: test:
+; ALIGN32:         .p2align 4, 0x90
+; ALIGN32-NEXT:  .LBB0_1: # %outer
+; ALIGN32-NEXT:    # =>This Loop Header: Depth=1
+; ALIGN32-NEXT:    # Child Loop BB0_2 Depth 2
+; ALIGN32:         .p2align 5, 0x90
+; ALIGN32-NEXT:  .LBB0_2: # %inner
+; ALIGN32-NEXT:    # Parent Loop BB0_1 Depth=1
+; ALIGN32-NEXT:    # => This Inner Loop Header: Depth=2
+
+; ALIGN64-LABEL: test:
+; ALIGN64:         .p2align 5, 0x90
+; ALIGN64-NEXT:  .LBB0_1: # %outer
+; ALIGN64-NEXT:    # =>This Loop Header: Depth=1
+; ALIGN64-NEXT:    # Child Loop BB0_2 Depth 2
+; ALIGN64:         .p2align 6, 0x90
+; ALIGN64-NEXT:  .LBB0_2: # %inner
+; ALIGN64-NEXT:    # Parent Loop BB0_1 Depth=1
+; ALIGN64-NEXT:    # => This Inner Loop Header: Depth=2
+
+entry:
+  br label %outer
+
+outer:
+  %outer.iv = phi i32 [0, %entry], [%outer.iv.next, %outer_bb]
+  br label %inner
+
+inner:
+  %inner.iv = phi i32 [0, %outer], [%inner.iv.next, %inner]
+  call void @foo()
+  %inner.iv.next = add i32 %inner.iv, 1
+  %inner.cond = icmp ne i32 %inner.iv.next, %m
+  br i1 %inner.cond, label %inner, label %outer_bb
+
+outer_bb:
+  %outer.iv.next = add i32 %outer.iv, 1
+  %outer.cond = icmp ne i32 %outer.iv.next, %n
+  br i1 %outer.cond, label %outer, label %exit
+
+exit:
+  ret void
+}