[ScalarizeMaskedMemIntrin] When expanding masked loads, start with the passthru value and insert each conditional load result over its element.

Previously we started with undef and merged with the passthru value at the end using a single select.

llvm-svn: 343271
Craig Topper 2018-09-27 21:28:52 +00:00
parent dfc0f289fa
commit 7d234d6628
3 changed files with 31 additions and 74 deletions
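To orient the diffs below: for a variable mask the pass emits a chain of conditional scalar loads, and this patch threads the passthru value through that chain instead of merging it in at the end. What follows is a minimal hand-written sketch of the new expansion for a <2 x i64> masked load — all value and block names are illustrative, not taken from the patch; the commented select at the bottom is what the old scheme had to append:

define <2 x i64> @sketch(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %passthru) {
entry:
  %ptr = bitcast <2 x i64>* %p to i64*
  %m0 = extractelement <2 x i1> %mask, i32 0
  br i1 %m0, label %cond.load, label %else

cond.load:                                    ; load lane 0 over %passthru
  %gep0 = getelementptr inbounds i64, i64* %ptr, i32 0
  %val0 = load i64, i64* %gep0, align 8
  %ins0 = insertelement <2 x i64> %passthru, i64 %val0, i32 0
  br label %else

else:                                         ; join with the untouched %passthru
  %res0 = phi <2 x i64> [ %ins0, %cond.load ], [ %passthru, %entry ]
  %m1 = extractelement <2 x i1> %mask, i32 1
  br i1 %m1, label %cond.load1, label %else2

cond.load1:                                   ; load lane 1 over the lane-0 result
  %gep1 = getelementptr inbounds i64, i64* %ptr, i32 1
  %val1 = load i64, i64* %gep1, align 8
  %ins1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
  br label %else2

else2:                                        ; the last phi is already the result
  %res1 = phi <2 x i64> [ %ins1, %cond.load1 ], [ %res0, %else ]
  ; previously the result was built up from undef and merged here with
  ;   %merged = select <2 x i1> %mask, <2 x i64> %res1, <2 x i64> %passthru
  ret <2 x i64> %res1
}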

@@ -90,7 +90,7 @@ FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
 //  cond.load:                                        ; preds = %0
 //  %3 = getelementptr i32* %1, i32 0
 //  %4 = load i32* %3
-//  %5 = insertelement <16 x i32> undef, i32 %4, i32 0
+//  %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0
 //  br label %else
 //
 // else:                                              ; preds = %0, %cond.load
@@ -146,10 +146,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
   Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
   unsigned VectorWidth = VecType->getNumElements();
 
-  Value *UndefVal = UndefValue::get(VecType);
-
   // The result vector
-  Value *VResult = UndefVal;
+  Value *VResult = Src0;
 
   if (isa<Constant>(Mask)) {
     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
@@ -161,15 +159,11 @@ static void scalarizeMaskedLoad(CallInst *CI) {
       VResult =
          Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
     }
-    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
-    CI->replaceAllUsesWith(NewI);
+    CI->replaceAllUsesWith(VResult);
     CI->eraseFromParent();
     return;
   }
 
-  PHINode *Phi = nullptr;
-  Value *PrevPhi = UndefVal;
-
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
@@ -177,13 +171,6 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
     //  br i1 %mask_1, label %cond.load, label %else
     //
-    if (Idx > 0) {
-      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
-      Phi->addIncoming(VResult, CondBlock);
-      Phi->addIncoming(PrevPhi, PrevIfBlock);
-      PrevPhi = Phi;
-      VResult = Phi;
-    }
 
     Value *Predicate =
         Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
@@ -200,7 +187,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     Value *Gep =
         Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
     LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
-    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+    Value *NewVResult = Builder.CreateInsertElement(VResult, Load,
+                                                    Builder.getInt32(Idx));
 
     // Create "else" block, fill it in the next iteration
     BasicBlock *NewIfBlock =
@@ -211,13 +199,15 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     OldBr->eraseFromParent();
     PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
+
+    // Create the phi to join the new and previous value.
+    PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+    Phi->addIncoming(NewVResult, CondBlock);
+    Phi->addIncoming(VResult, PrevIfBlock);
+    VResult = Phi;
   }
 
-  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
-  Phi->addIncoming(VResult, CondBlock);
-  Phi->addIncoming(PrevPhi, PrevIfBlock);
-  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
-  CI->replaceAllUsesWith(NewI);
+  CI->replaceAllUsesWith(VResult);
   CI->eraseFromParent();
 }
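The constant-mask fast path (the isa<Constant>(Mask) branch above) simplifies the same way: each known-true lane is loaded and inserted directly into the passthru vector, and the trailing CreateSelect is gone entirely. A hand-written sketch of what that path now emits, assuming a hypothetical <i1 false, i1 true> mask and illustrative names:

  %firstelt = bitcast <2 x i64>* %p to i64*
  %gep1 = getelementptr inbounds i64, i64* %firstelt, i32 1
  %val1 = load i64, i64* %gep1, align 8
  ; lane 0 is known false, so element 0 of %passthru flows through untouched
  %res = insertelement <2 x i64> %passthru, i64 %val1, i32 1
  ; previously: insert into undef, then select against %passthru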

@@ -12,50 +12,20 @@ define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double
 ; AVX-LABEL: loadv1:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    ## implicit-def: $xmm1
-; AVX-NEXT:    je LBB0_1
-; AVX-NEXT:  ## %bb.2: ## %else
-; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    jne LBB0_3
-; AVX-NEXT:  LBB0_4: ## %else
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
-; AVX-NEXT:    retq
-; AVX-NEXT:  LBB0_1: ## %cond.load
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    je LBB0_4
-; AVX-NEXT:  LBB0_3: ## %else
-; AVX-NEXT:    vmovaps %xmm0, %xmm1
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    jne LBB0_2
+; AVX-NEXT:  ## %bb.1: ## %cond.load
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:  LBB0_2: ## %else
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: loadv1:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    testq %rdi, %rdi
-; AVX512F-NEXT:    ## implicit-def: $xmm1
-; AVX512F-NEXT:    jne LBB0_2
-; AVX512F-NEXT:  ## %bb.1: ## %cond.load
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:  LBB0_2: ## %else
-; AVX512F-NEXT:    testq %rdi, %rdi
-; AVX512F-NEXT:    sete %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: loadv1:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    testq %rdi, %rdi
-; SKX-NEXT:    ## implicit-def: $xmm1
-; SKX-NEXT:    jne LBB0_2
-; SKX-NEXT:  ## %bb.1: ## %cond.load
-; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; SKX-NEXT:  LBB0_2: ## %else
-; SKX-NEXT:    testq %rdi, %rdi
-; SKX-NEXT:    sete %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; SKX-NEXT:    retq
+; AVX512-LABEL: loadv1:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    testq %rdi, %rdi
+; AVX512-NEXT:    jne LBB0_2
+; AVX512-NEXT:  ## %bb.1: ## %cond.load
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:  LBB0_2: ## %else
+; AVX512-NEXT:    retq
   %mask = icmp eq <1 x i64> %trigger, zeroinitializer
   %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
   ret <1 x double> %res

@@ -9,10 +9,10 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass
 ; CHECK:       cond.load:
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP4]], i32 0
 ; CHECK-NEXT:    br label [[ELSE]]
 ; CHECK:       else:
-; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ undef, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
 ; CHECK:       cond.load1:
@@ -21,9 +21,8 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    br label [[ELSE2]]
 ; CHECK:       else2:
-; CHECK-NEXT:    [[RES_PHI_SELECT:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[MASK]], <2 x i64> [[RES_PHI_SELECT]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> %mask, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -41,8 +40,7 @@ define <2 x i64> @scalarize_v2i64_ones_mask(<2 x i64>* %p, <2 x i64> %passthru)
 define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) {
 ; CHECK-LABEL: @scalarize_v2i64_zero_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
-; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -53,9 +51,8 @@ define <2 x i64> @scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %passthru)
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> <i1 false, i1 true>, <2 x i64> [[TMP4]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret