[DSE] Support looking through memory phis at end of function.

Update isWriteAtEndOfFunction to look through MemoryPhis. MemoryPhis
have been skipped so far because of a known AliasAnalysis limitation: it
can miss loop-carried dependences.

This problem is already addressed in other parts of the code by skipping
MemoryDefs that may be in different loops. I think the same logic can
be applied here.

This can have a substantial impact on the number of stores removed in
some cases. For MultiSource/SPEC2006/SPEC2017 with -O3:

```
Metric: dse.NumFastStores

Program                                       dse.NumFastStores
                                              base              patch   diff
External/S...CINT2017rate/557.xz_r/557.xz_r     14.00             45.00 221.4%
External/S...te/538.imagick_r/538.imagick_r    439.00           1267.00 188.6%
MultiSourc...e/Applications/SIBsim4/SIBsim4      6.00             15.00 150.0%
MultiSourc...Prolangs-C/simulator/simulator      3.00              7.00 133.3%
MultiSource/Applications/siod/siod               3.00              7.00 133.3%
MultiSourc...arks/FreeBench/distray/distray      6.00              9.00  50.0%
MultiSourc...e/Applications/obsequi/Obsequi     22.00             30.00  36.4%
MultiSource/Benchmarks/Ptrdist/bc/bc            23.00             28.00  21.7%
External/S...NT2017rate/502.gcc_r/502.gcc_r   1258.00           1512.00  20.2%
External/S...te/520.omnetpp_r/520.omnetpp_r    954.00           1143.00  19.8%
External/S...rate/510.parest_r/510.parest_r   5961.00           7122.00  19.5%
External/S...C/CINT2006/445.gobmk/445.gobmk     47.00             56.00  19.1%
External/S...00.perlbench_r/500.perlbench_r    241.00            286.00  18.7%
External/S...NT2006/471.omnetpp/471.omnetpp     36.00             42.00  16.7%
External/S...06/400.perlbench/400.perlbench    183.00            210.00  14.8%
MultiSource/Applications/SPASS/SPASS            72.00             81.00  12.5%
External/S...17rate/541.leela_r/541.leela_r     72.00             80.00  11.1%
External/SPEC/CINT2006/403.gcc/403.gcc         585.00            642.00   9.7%
MultiSourc...e/Applications/sqlite3/sqlite3    120.00            131.00   9.2%
MultiSourc...Applications/hexxagon/hexxagon     11.00             12.00   9.1%
External/S.../CFP2006/453.povray/453.povray    566.00            615.00   8.7%
External/S...rate/511.povray_r/511.povray_r    578.00            627.00   8.5%
External/S...FP2006/482.sphinx3/482.sphinx3     12.00             13.00   8.3%
MultiSource/Applications/oggenc/oggenc         130.00            140.00   7.7%
MultiSourc...e/Applications/ClamAV/clamscan    250.00            268.00   7.2%
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg     19.00             20.00   5.3%
MultiSourc...ch/consumer-jpeg/consumer-jpeg     19.00             20.00   5.3%
External/S...te/526.blender_r/526.blender_r   3747.00           3928.00   4.8%
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE    104.00            108.00   3.8%
MultiSourc...ch/consumer-lame/consumer-lame     54.00             56.00   3.7%
MultiSource/Benchmarks/Bullet/bullet          1222.00           1264.00   3.4%
MultiSourc...nchmarks/tramp3d-v4/tramp3d-v4    973.00           1005.00   3.3%
External/S.../CFP2006/447.dealII/447.dealII   2699.00           2780.00   3.0%
External/S...06/483.xalancbmk/483.xalancbmk    788.00            810.00   2.8%
External/S.../CFP2006/450.soplex/450.soplex    180.00            185.00   2.8%
MultiSourc.../DOE-ProxyApps-C++/CLAMR/CLAMR    338.00            345.00   2.1%
MultiSourc...Benchmarks/7zip/7zip-benchmark    685.00            699.00   2.0%
External/S...FP2017rate/544.nab_r/544.nab_r    158.00            160.00   1.3%
MultiSourc...sumer-typeset/consumer-typeset    772.00            781.00   1.2%
External/S...2017rate/525.x264_r/525.x264_r    410.00            414.00   1.0%
External/S...23.xalancbmk_r/523.xalancbmk_r    998.00           1002.00   0.4%
```

Compile-time is almost neutral:

https://llvm-compile-time-tracker.com/compare.php?from=b3125ad3d60531a97eea20009cc9629a87755862&to=84007eee59004f43464eda7f5ba8263ed5158df8&stat=instructions

NewPM-O3: +0.03%
NewPM-ReleaseThinLTO: -0.01%
NewPM-ReleaseLTO-g: +0.03%

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D132365
This commit is contained in:
Florian Hahn 2022-08-30 13:27:50 +01:00
parent 23dec4a352
commit b5e208fcba
No known key found for this signature in database
GPG Key ID: EEF712BB5E80EBBA
4 changed files with 13 additions and 25 deletions

View File

@ -1075,13 +1075,16 @@ struct DSEState {
}
MemoryAccess *UseAccess = WorkList[I];
// Simply adding the users of MemoryPhi to the worklist is not enough,
// because we might miss read clobbers in different iterations of a loop,
// for example.
// TODO: Add support for phi translation to handle the loop case.
if (isa<MemoryPhi>(UseAccess))
return false;
if (isa<MemoryPhi>(UseAccess)) {
// AliasAnalysis does not account for loops. Limit elimination to
// candidates for which we can guarantee they always store to the same
// memory location.
if (!isGuaranteedLoopInvariant(MaybeLoc->Ptr))
return false;
PushMemUses(cast<MemoryPhi>(UseAccess));
continue;
}
// TODO: Checking for aliasing is expensive. Consider reducing the amount
// of times this is called and/or caching it.
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();

View File

@ -138,8 +138,6 @@ define void @alloca_1(i1 %c) {
; CHECK: bb1:
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void
@ -177,12 +175,8 @@ define void @alloca_2(i1 %c) {
; CHECK-NEXT: call void @readonly_use(i32* [[P]])
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: ret void

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -dse -S %s | FileCheck %s
; TODO: Both the stores in %then and %else can be eliminated by translating %p
; Both the stores in %then and %else can be eliminated by translating %p
; through the phi.
define void @memoryphi_translate_1(i1 %c) {
; CHECK-LABEL: @memoryphi_translate_1(
@ -10,10 +10,8 @@ define void @memoryphi_translate_1(i1 %c) {
; CHECK-NEXT: [[A_2:%.*]] = alloca i8, align 1
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: store i8 0, i8* [[A_1]], align 1
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: else:
; CHECK-NEXT: store i8 9, i8* [[A_2]], align 1
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[P:%.*]] = phi i8* [ [[A_1]], [[THEN]] ], [ [[A_2]], [[ELSE]] ]
@ -39,7 +37,7 @@ end:
ret void
}
; TODO: The store in %else can be eliminated by translating %p through the phi.
; The store in %else can be eliminated by translating %p through the phi.
; The store in %then cannot be eliminated, because %a.1 is read before the final
; store.
define i8 @memoryphi_translate_2(i1 %c) {
@ -52,7 +50,6 @@ define i8 @memoryphi_translate_2(i1 %c) {
; CHECK-NEXT: store i8 0, i8* [[A_1]], align 1
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: else:
; CHECK-NEXT: store i8 9, i8* [[A_2]], align 1
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[P:%.*]] = phi i8* [ [[A_1]], [[THEN]] ], [ [[A_2]], [[ELSE]] ]
@ -80,7 +77,7 @@ end:
ret i8 %l
}
; TODO: The store in %then can be eliminated by translating %p through the phi.
; The store in %then can be eliminated by translating %p through the phi.
; The store in %else cannot be eliminated, because %a.2 is read before the final
; store.
define i8 @memoryphi_translate_3(i1 %c) {
@ -90,7 +87,6 @@ define i8 @memoryphi_translate_3(i1 %c) {
; CHECK-NEXT: [[A_2:%.*]] = alloca i8, align 1
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: store i8 0, i8* [[A_1]], align 1
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: else:
; CHECK-NEXT: store i8 9, i8* [[A_2]], align 1
@ -166,11 +162,9 @@ define void @memoryphi_translate_5(i1 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
; CHECK-NEXT: [[B:%.*]] = alloca i8, align 1
; CHECK-NEXT: [[C:%.*]] = alloca i8, align 1
; CHECK-NEXT: store i8 0, i8* [[A]], align 1
; CHECK-NEXT: br i1 [[COND:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
; CHECK: cond.true:
; CHECK-NEXT: store i8 0, i8* [[C]], align 1
; CHECK-NEXT: br label [[COND_END]]
; CHECK: cond.end:
; CHECK-NEXT: [[P:%.*]] = phi i8* [ [[B]], [[COND_TRUE]] ], [ [[A]], [[ENTRY:%.*]] ]

View File

@ -225,12 +225,9 @@ define i8 @test4_read_between(i8 *%P) {
define void @test4_non_local(i8 *%P, i1 %c) {
; CHECK-LABEL: @test4_non_local(
; CHECK-NEXT: [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
; CHECK-NEXT: [[A2:%.*]] = bitcast %1* [[A1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; CHECK-NEXT: br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
; CHECK: call:
; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[P]])
; CHECK-NEXT: call void @test4a(i8* byval(i8) align 1 [[P:%.*]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void