[LV] Either get invariant condition OR vector condition.

Currently we unconditionally get the first lane of the condition
operand, even if we later use the full vector condition. This can result
in some unnecessary instructions being generated.

Suggested as follow-up in D80219.
This commit is contained in:
Florian Hahn 2020-05-24 14:14:43 +01:00
parent d43fac052e
commit 0deab8a54f
3 changed files with 43 additions and 48 deletions

View File

@ -4422,15 +4422,15 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
// loop. This means that we can't just use the original 'cond' value. // loop. This means that we can't just use the original 'cond' value.
// We have to take the 'vectorized' value and pick the first lane. // We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op. // Instcombine will make this a no-op.
auto *InvarCond =
auto *ScalarCond = State.get(Operands.getOperand(0), {0, 0}); InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cond = State.get(Operands.getOperand(0), Part); Value *Cond =
InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
Value *Op0 = State.get(Operands.getOperand(1), Part); Value *Op0 = State.get(Operands.getOperand(1), Part);
Value *Op1 = State.get(Operands.getOperand(2), Part); Value *Op1 = State.get(Operands.getOperand(2), Part);
Value *Sel = Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
VectorLoopValueMap.setVectorValue(&I, Part, Sel); VectorLoopValueMap.setVectorValue(&I, Part, Sel);
addMetadata(Sel, &I); addMetadata(Sel, &I);
} }

View File

@ -27,13 +27,12 @@ define i32 @foo_optsize() #0 {
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block: ; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph: ; CHECK: scalar.ph:
@ -42,8 +41,8 @@ define i32 @foo_optsize() #0 {
; CHECK: for.body: ; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
@ -69,13 +68,12 @@ define i32 @foo_optsize() #0 {
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* ; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef) ; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0 ; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> ; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* ; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 ; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; AUTOVF: middle.block: ; AUTOVF: middle.block:
; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AUTOVF: scalar.ph: ; AUTOVF: scalar.ph:
@ -84,8 +82,8 @@ define i32 @foo_optsize() #0 {
; AUTOVF: for.body: ; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] ; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 ; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
@ -133,13 +131,12 @@ define i32 @foo_minsize() #1 {
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK: middle.block: ; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph: ; CHECK: scalar.ph:
@ -148,8 +145,8 @@ define i32 @foo_minsize() #1 {
; CHECK: for.body: ; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
@ -175,13 +172,12 @@ define i32 @foo_minsize() #1 {
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* ; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef) ; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0 ; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> ; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* ; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 ; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; AUTOVF: middle.block: ; AUTOVF: middle.block:
; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AUTOVF: scalar.ph: ; AUTOVF: scalar.ph:
@ -190,8 +186,8 @@ define i32 @foo_minsize() #1 {
; AUTOVF: for.body: ; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] ; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 ; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1

View File

@ -55,7 +55,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body: ; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]]
@ -63,24 +63,23 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block: ; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP6]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP6]], <4 x float> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph: ; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop: ; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@ -93,7 +92,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: [[T8:%.*]] = icmp eq i64 [[T7]], 65537 ; CHECK-NEXT: [[T8:%.*]] = icmp eq i64 [[T7]], 65537
; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop !2 ; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop !2
; CHECK: out: ; CHECK: out:
; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[T6_LCSSA]] ; CHECK-NEXT: ret float [[T6_LCSSA]]
; ;
top: top: