[GVN] Disable enable-split-backedge-in-load-pre option by default

This option was added in D89854. It prevents GVN from performing
load PRE in a loop, if doing so would require critical edge
splitting on the backedge. From the review:

> I know that GVN Load PRE negatively impacts peeling,
> loop predication, so the passes expecting that latch has
> a conditional branch.

In the PhaseOrdering test in this patch, splitting the backedge
negatively affects vectorization: After critical edge splitting,
the loop gets rotated, effectively peeling off the first loop
iteration. The effect is that the first element is handled
separately, then the bulk of the elements use a vectorized
reduction (but using unaligned, off-by-one memory accesses) and
then a tail of 15 elements is handled separately again.

It's probably worth noting that the loop load PRE from D99926 is
not affected by this change (as it does not need backedge
splitting). This is about normal load PRE that happens to occur
inside a loop.

Differential Revision: https://reviews.llvm.org/D126382
This commit is contained in:
Nikita Popov 2022-05-25 16:37:38 +02:00
parent f5fa633b09
commit 1721ff1dfd
7 changed files with 23 additions and 86 deletions

View File

@ -107,7 +107,7 @@ static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
cl::init(true));
static cl::opt<bool>
GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre",
cl::init(true));
cl::init(false));
static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
static cl::opt<uint32_t> MaxNumDeps(

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loops -basic-aa -gvn -enable-load-pre -S | FileCheck %s
; RUN: opt < %s -loops -basic-aa -gvn -enable-split-backedge-in-load-pre -S | FileCheck %s
define dso_local void @test1(i32* nocapture readonly %aa, i32* nocapture %bb) local_unnamed_addr {
; CHECK-LABEL: @test1(

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='require<domtree>,loop(loop-simplifycfg),gvn' -S %s | FileCheck %s
; RUN: opt -passes='require<domtree>,loop(loop-simplifycfg),gvn' -enable-split-backedge-in-load-pre -S %s | FileCheck %s
define i32 @test_pointer_phi_select_same_object(i32* %ptr, i32* %end) {
; CHECK-LABEL: @test_pointer_phi_select_same_object(

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basic-aa -gvn -S -dce | FileCheck %s --check-prefixes=CHECK,LE
; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basic-aa -gvn -S -dce | FileCheck %s --check-prefixes=CHECK,BE
; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basic-aa -gvn -enable-split-backedge-in-load-pre -S -dce | FileCheck %s --check-prefixes=CHECK,LE
; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basic-aa -gvn -enable-split-backedge-in-load-pre -S -dce | FileCheck %s --check-prefixes=CHECK,BE
;; Trivial RLE test.
define i32 @test0(i32 %V, i32* %P) {

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Tests that check our handling of volatile instructions encountered
; when scanning for dependencies
; RUN: opt -basic-aa -gvn -S < %s | FileCheck %s
; RUN: opt -basic-aa -gvn -enable-split-backedge-in-load-pre -S < %s | FileCheck %s
; Check that we can bypass a volatile load when searching
; for dependencies of a non-volatile load

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=gvn -S %s | FileCheck %s
; RUN: opt -passes=gvn -enable-split-backedge-in-load-pre -S %s | FileCheck %s
; Test case for PR31651.

View File

@ -11,89 +11,26 @@ define i16 @test(ptr %ptr) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[FIRST:%.*]] = load i8, ptr [[PTR:%.*]], align 1
; CHECK-NEXT: tail call void @use(i8 [[FIRST]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[VAL_EXT2:%.*]] = zext i8 [[FIRST]] to i16
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> <i16 poison, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 [[VAL_EXT2]], i64 0
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ [[TMP0]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <8 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i16>
; CHECK-NEXT: [[TMP5]] = add <8 x i16> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT: [[TMP6]] = add <8 x i16> [[VEC_PHI5]], [[TMP4]]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i16>
; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT: [[TMP5]] = add <8 x i16> [[VEC_PHI1]], [[TMP3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_LOOP_CRIT_EDGE:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: loop.loop_crit_edge:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[BIN_RDX]])
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1009
; CHECK-NEXT: [[VAL_PRE:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT]], align 1
; CHECK-NEXT: [[VAL_EXT:%.*]] = zext i8 [[VAL_PRE]] to i16
; CHECK-NEXT: [[ACCUM_NEXT:%.*]] = add i16 [[TMP8]], [[VAL_EXT]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1010
; CHECK-NEXT: [[VAL_PRE_1:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_1]], align 1
; CHECK-NEXT: [[VAL_EXT_1:%.*]] = zext i8 [[VAL_PRE_1]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_1:%.*]] = add i16 [[ACCUM_NEXT]], [[VAL_EXT_1]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1011
; CHECK-NEXT: [[VAL_PRE_2:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_2]], align 1
; CHECK-NEXT: [[VAL_EXT_2:%.*]] = zext i8 [[VAL_PRE_2]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_2:%.*]] = add i16 [[ACCUM_NEXT_1]], [[VAL_EXT_2]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1012
; CHECK-NEXT: [[VAL_PRE_3:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_3]], align 1
; CHECK-NEXT: [[VAL_EXT_3:%.*]] = zext i8 [[VAL_PRE_3]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_3:%.*]] = add i16 [[ACCUM_NEXT_2]], [[VAL_EXT_3]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1013
; CHECK-NEXT: [[VAL_PRE_4:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_4]], align 1
; CHECK-NEXT: [[VAL_EXT_4:%.*]] = zext i8 [[VAL_PRE_4]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_4:%.*]] = add i16 [[ACCUM_NEXT_3]], [[VAL_EXT_4]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1014
; CHECK-NEXT: [[VAL_PRE_5:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_5]], align 1
; CHECK-NEXT: [[VAL_EXT_5:%.*]] = zext i8 [[VAL_PRE_5]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_5:%.*]] = add i16 [[ACCUM_NEXT_4]], [[VAL_EXT_5]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_6:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1015
; CHECK-NEXT: [[VAL_PRE_6:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_6]], align 1
; CHECK-NEXT: [[VAL_EXT_6:%.*]] = zext i8 [[VAL_PRE_6]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_6:%.*]] = add i16 [[ACCUM_NEXT_5]], [[VAL_EXT_6]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_7:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1016
; CHECK-NEXT: [[VAL_PRE_7:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_7]], align 1
; CHECK-NEXT: [[VAL_EXT_7:%.*]] = zext i8 [[VAL_PRE_7]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_7:%.*]] = add i16 [[ACCUM_NEXT_6]], [[VAL_EXT_7]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1017
; CHECK-NEXT: [[VAL_PRE_8:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_8]], align 1
; CHECK-NEXT: [[VAL_EXT_8:%.*]] = zext i8 [[VAL_PRE_8]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_8:%.*]] = add i16 [[ACCUM_NEXT_7]], [[VAL_EXT_8]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1018
; CHECK-NEXT: [[VAL_PRE_9:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_9]], align 1
; CHECK-NEXT: [[VAL_EXT_9:%.*]] = zext i8 [[VAL_PRE_9]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_9:%.*]] = add i16 [[ACCUM_NEXT_8]], [[VAL_EXT_9]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1019
; CHECK-NEXT: [[VAL_PRE_10:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_10]], align 1
; CHECK-NEXT: [[VAL_EXT_10:%.*]] = zext i8 [[VAL_PRE_10]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_10:%.*]] = add i16 [[ACCUM_NEXT_9]], [[VAL_EXT_10]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1020
; CHECK-NEXT: [[VAL_PRE_11:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_11]], align 1
; CHECK-NEXT: [[VAL_EXT_11:%.*]] = zext i8 [[VAL_PRE_11]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_11:%.*]] = add i16 [[ACCUM_NEXT_10]], [[VAL_EXT_11]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1021
; CHECK-NEXT: [[VAL_PRE_12:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_12]], align 1
; CHECK-NEXT: [[VAL_EXT_12:%.*]] = zext i8 [[VAL_PRE_12]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_12:%.*]] = add i16 [[ACCUM_NEXT_11]], [[VAL_EXT_12]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_13:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1022
; CHECK-NEXT: [[VAL_PRE_13:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_13]], align 1
; CHECK-NEXT: [[VAL_EXT_13:%.*]] = zext i8 [[VAL_PRE_13]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_13:%.*]] = add i16 [[ACCUM_NEXT_12]], [[VAL_EXT_13]]
; CHECK-NEXT: [[GEP_PHI_TRANS_INSERT_14:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1023
; CHECK-NEXT: [[VAL_PRE_14:%.*]] = load i8, ptr [[GEP_PHI_TRANS_INSERT_14]], align 1
; CHECK-NEXT: [[VAL_EXT_14:%.*]] = zext i8 [[VAL_PRE_14]] to i16
; CHECK-NEXT: [[ACCUM_NEXT_14:%.*]] = add i16 [[ACCUM_NEXT_13]], [[VAL_EXT_14]]
; CHECK-NEXT: ret i16 [[ACCUM_NEXT_14]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[BIN_RDX]])
; CHECK-NEXT: ret i16 [[TMP7]]
;
entry:
%first = load i8, ptr %ptr