[InstCombine] improve demanded element analysis for vector insert-of-extract (2nd try)

The 1st attempt (rG557b890) was reverted because it caused miscompiles.
That bug is avoided here by changing the order of folds and as verified
in the new tests.

Original commit message:
InstCombine currently has odd rules for folding insert-extract chains to shuffles,
so we miss collapsing seemingly simple cases as shown in the tests here.

But poison makes this not quite as easy as we might have guessed. Alive2 tests to
show the subtle difference (similar to the regression tests):
https://alive2.llvm.org/ce/z/hp4hv3 (this is ok)
https://alive2.llvm.org/ce/z/ehEWaN (poison leakage)

SLP tends to create these patterns (as shown in the SLP tests), and this could
help with solving PR16739.

Differential Revision: https://reviews.llvm.org/D86460
This commit is contained in:
Sanjay Patel 2020-08-25 11:11:43 -04:00
parent 11f8d4aa10
commit c4f0a0896f
4 changed files with 42 additions and 39 deletions

View File

@ -1155,6 +1155,19 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
if (IdxNo < VWidth)
PreInsertDemandedElts.clearBit(IdxNo);
// If we only demand the element that is being inserted and that element
// was extracted from the same index in another vector with the same type,
// replace this insert with that other vector.
// Note: This is attempted before the call to simplifyAndSetOp because that
// may change UndefElts to a value that does not match with Vec.
Value *Vec;
if (PreInsertDemandedElts == 0 &&
match(I->getOperand(1),
m_ExtractElt(m_Value(Vec), m_SpecificInt(IdxNo))) &&
Vec->getType() == I->getType()) {
return Vec;
}
simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
// If this is inserting an element that isn't demanded, remove this

View File

@ -749,9 +749,7 @@ define <4 x i8> @select_cond_(<4 x i8> %x, <4 x i8> %min, <4 x i1> %cmp, i1 %poi
define <4 x float> @ins_of_ext(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0
; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[Y:%.*]], i32 1
; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 1
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
; CHECK-NEXT: ret <4 x float> [[I3]]
@ -766,11 +764,7 @@ define <4 x float> @ins_of_ext(<4 x float> %x, float %y) {
define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_twice(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[E0]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x float> [[X]], i32 1
; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[E1]], i32 1
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[Y:%.*]], i32 2
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[X:%.*]], float [[Y:%.*]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[Y]], i32 3
; CHECK-NEXT: ret <4 x float> [[I3]]
;
@ -783,6 +777,9 @@ define <4 x float> @ins_of_ext_twice(<4 x float> %x, float %y) {
ret <4 x float> %i3
}
; Negative test - element 3 of the result must be undef to be poison safe.
; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_wrong_demand(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@ -798,6 +795,9 @@ define <4 x float> @ins_of_ext_wrong_demand(<4 x float> %x, float %y) {
ret <4 x float> %i2
}
; Negative test - can't replace i0 with x.
; TODO: Could convert insert/extract to identity shuffle with undef mask elements.
define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) {
; CHECK-LABEL: @ins_of_ext_wrong_type(
; CHECK-NEXT: [[E0:%.*]] = extractelement <5 x float> [[X:%.*]], i32 0
@ -819,9 +819,7 @@ define <4 x float> @ins_of_ext_wrong_type(<5 x float> %x, float %y) {
define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4 %x) {
; CHECK-LABEL: @ins_of_ext_undef_elts_propagation(
; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x i4> [[V:%.*]], i32 0
; CHECK-NEXT: [[T0:%.*]] = insertelement <4 x i4> undef, i4 [[V0]], i32 0
; CHECK-NEXT: [[T2:%.*]] = insertelement <4 x i4> [[T0]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[T2:%.*]] = insertelement <4 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i4> [[T2]], <4 x i4> [[V2:%.*]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
; CHECK-NEXT: ret <4 x i4> [[R]]
;
@ -836,11 +834,7 @@ define <4 x i4> @ins_of_ext_undef_elts_propagation(<4 x i4> %v, <4 x i4> %v2, i4
define <8 x i4> @ins_of_ext_undef_elts_propagation2(<8 x i4> %v, <8 x i4> %v2, i4 %x) {
; CHECK-LABEL: @ins_of_ext_undef_elts_propagation2(
; CHECK-NEXT: [[I15:%.*]] = extractelement <8 x i4> [[V:%.*]], i32 0
; CHECK-NEXT: [[I16:%.*]] = insertelement <8 x i4> undef, i4 [[I15]], i32 0
; CHECK-NEXT: [[I17:%.*]] = extractelement <8 x i4> [[V]], i32 1
; CHECK-NEXT: [[I18:%.*]] = insertelement <8 x i4> [[I16]], i4 [[I17]], i32 1
; CHECK-NEXT: [[I19:%.*]] = insertelement <8 x i4> [[I18]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[I19:%.*]] = insertelement <8 x i4> [[V:%.*]], i4 [[X:%.*]], i32 2
; CHECK-NEXT: [[I20:%.*]] = shufflevector <8 x i4> [[I19]], <8 x i4> [[V2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 10, i32 9, i32 8, i32 undef>
; CHECK-NEXT: [[I21:%.*]] = shufflevector <8 x i4> [[I20]], <8 x i4> [[V]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
; CHECK-NEXT: ret <8 x i4> [[I21]]

View File

@ -51,13 +51,13 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP20:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP20]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
@ -83,12 +83,11 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ADD11]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
; CHECK-NEXT: [[TMP20]] = add nsw <2 x i32> [[TMP18]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP21]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
@ -157,13 +156,13 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64
@ -188,12 +187,11 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[ADD11]], i32 1
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
; CHECK-NEXT: [[TMP17]] = add nsw <2 x i32> [[TMP15]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP18]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:

View File

@ -183,13 +183,11 @@ define void @vecload_vs_broadcast5(double * noalias %from, double * noalias %to,
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void