[GlobalOpt] Perform store->dominated load forwarding for stored once globals

The initial land incorrectly optimized forwarding non-Constants in non-nosync/norecurse functions. Bail on non-Constants since norecurse should cause global -> alloca promotion anyway.

The initial land also incorrectly assumed that StoredOnceStore was the only store to the global, but it actually means that only one value other than the global initializer is stored. Add a check that there's only one store.

Compile time tracker:
https://llvm-compile-time-tracker.com/compare.php?from=c80b88ee29f34078d2149de94e27600093e6c7c0&to=ef2c2b7772424b6861a75e794f3c31b45167304a&stat=instructions

Reviewed By: nikic, asbirlea, jdoerfert

Differential Revision: https://reviews.llvm.org/D128128
This commit is contained in:
Arthur Eubanks 2022-06-18 14:14:04 -07:00
parent d3cbcc4e89
commit e422c0d3b2
7 changed files with 63 additions and 13 deletions

View File

@ -35,6 +35,9 @@ struct GlobalStatus {
/// can be deleted.
bool IsLoaded = false;
/// Number of stores to the global.
unsigned NumStores = 0;
/// Keep track of what stores to the global look like.
enum StoredType {
/// There is no store to this global. It can thus be marked constant.

View File

@ -1444,6 +1444,42 @@ static void makeAllConstantUsesInstructions(Constant *C) {
}
}
// For a global variable with one store, if the store dominates any loads,
// those loads will always load the stored value (as opposed to the
// initializer), even in the presence of recursion.
static bool forwardStoredOnceStore(
GlobalVariable *GV, const StoreInst *StoredOnceStore,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
const Value *StoredOnceValue = StoredOnceStore->getValueOperand();
// We can do this optimization for non-constants in nosync + norecurse
// functions, but globals used in exactly one norecurse functions are already
// promoted to an alloca.
if (!isa<Constant>(StoredOnceValue))
return false;
const Function *F = StoredOnceStore->getFunction();
SmallVector<LoadInst *> Loads;
for (User *U : GV->users()) {
if (auto *LI = dyn_cast<LoadInst>(U)) {
if (LI->getFunction() == F &&
LI->getType() == StoredOnceValue->getType() && LI->isSimple())
Loads.push_back(LI);
}
}
// Only compute DT if we have any loads to examine.
bool MadeChange = false;
if (!Loads.empty()) {
auto &DT = LookupDomTree(*const_cast<Function *>(F));
for (auto *LI : Loads) {
if (DT.dominates(StoredOnceStore, LI)) {
LI->replaceAllUsesWith(const_cast<Value *>(StoredOnceValue));
LI->eraseFromParent();
MadeChange = true;
}
}
}
return MadeChange;
}
/// Analyze the specified global variable and optimize
/// it if possible. If we make a change, return true.
static bool
@ -1603,6 +1639,12 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
if (optimizeOnceStoredGlobal(GV, StoredOnceValue, DL, GetTLI))
return true;
// Try to forward the store to any loads. If we have more than one store, we
// may have a store of the initializer between StoredOnceStore and a load.
if (GS.NumStores == 1)
if (forwardStoredOnceStore(GV, GS.StoredOnceStore, LookupDomTree))
return true;
// Otherwise, if the global was not a boolean, we can shrink it to be a
// boolean. Skip this optimization for AS that doesn't allow an initializer.
if (SOVConstant && GS.Ordering == AtomicOrdering::NotAtomic &&

View File

@ -104,6 +104,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
if (SI->isVolatile())
return true;
++GS.NumStores;
GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
// If this is a direct store to the global (i.e., the global is a scalar

View File

@ -2,21 +2,24 @@
@foo = internal global i32 0, align 4
define dso_local i32 @bar() {
define void @store() {
entry:
store i32 5, i32* @foo, align 4
ret void
}
define i32 @bar() {
entry:
%0 = load i32, i32* @foo, align 4
ret i32 %0
}
;CHECK: @bar
;CHECK-NEXT: entry:
;CHECK-NEXT: store i1 true, i1* @foo, align 1, !dbg ![[DbgLocStore:[0-9]+]]
;CHECK-NEXT: %.b = load i1, i1* @foo, align 1, !dbg ![[DbgLocLoadSel:[0-9]+]]
;CHECK-NEXT: %0 = select i1 %.b, i32 5, i32 0, !dbg ![[DbgLocLoadSel]]
;CHECK-NEXT: call void @llvm.dbg.value({{.*}}), !dbg ![[DbgLocLoadSel]]
;CHECK-NEXT: ret i32 %0, !dbg ![[DbgLocRet:[0-9]+]]
;CHECK: ![[DbgLocStore]] = !DILocation(line: 1,
;CHECK: ![[DbgLocLoadSel]] = !DILocation(line: 2,
;CHECK: ![[DbgLocRet]] = !DILocation(line: 3,
;CHECK: ![[DbgLocLoadSel]] = !DILocation(line: 3,
;CHECK: ![[DbgLocRet]] = !DILocation(line: 4,

View File

@ -10,11 +10,13 @@
; Negative test for AS(3). Skip shrink global to bool optimization.
; CHECK: @lvar = internal unnamed_addr addrspace(3) global i32 undef
define void @test_global_var() {
define void @test_global_var(i1 %i) {
; CHECK-LABEL: @test_global_var(
; CHECK: store volatile i32 10, i32* undef, align 4
;
entry:
br i1 %i, label %bb1, label %exit
bb1:
store i32 10, i32* @gvar
br label %exit
exit:
@ -23,13 +25,15 @@ exit:
ret void
}
define void @test_lds_var() {
define void @test_lds_var(i1 %i) {
; CHECK-LABEL: @test_lds_var(
; CHECK: store i32 10, i32 addrspace(3)* @lvar, align 4
; CHECK: [[LD:%.*]] = load i32, i32 addrspace(3)* @lvar, align 4
; CHECK: store volatile i32 [[LD]], i32* undef, align 4
;
entry:
br i1 %i, label %bb1, label %exit
bb1:
store i32 10, i32 addrspace(3)* @lvar
br label %exit
exit:

View File

@ -15,10 +15,8 @@ declare void @b()
define i1 @dom_const() {
; CHECK-LABEL: @dom_const(
; CHECK-NEXT: store i1 true, ptr @g1, align 1
; CHECK-NEXT: call void @b()
; CHECK-NEXT: [[R:%.*]] = load i1, ptr @g1, align 1
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 true
;
store i1 true, ptr @g1
call void @b()
@ -90,8 +88,7 @@ define i1 @dom_multiple_function_loads() {
; CHECK-LABEL: @dom_multiple_function_loads(
; CHECK-NEXT: store i1 true, ptr @g6, align 1
; CHECK-NEXT: call void @b()
; CHECK-NEXT: [[R:%.*]] = load i1, ptr @g6, align 1
; CHECK-NEXT: ret i1 [[R]]
; CHECK-NEXT: ret i1 true
;
store i1 true, ptr @g6
call void @b()

View File

@ -9,7 +9,6 @@
define i32 @main() {
; CHECK-LABEL: @main(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i1 true, i1* @a, align 4
; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** @e, align 8
; CHECK-NEXT: store i32 0, i32* [[TMP0]], align 4
; CHECK-NEXT: store i32* null, i32** @e, align 8