[ScalarizeMaskedMemIntrin] When expanding masked loads, start with the passthru value and insert each conditional load result over its element.

Previously we started with undef and did one final merge at the end with a select.

llvm-svn: 343271
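To illustrate the new shape: for a <2 x i64> masked load with a variable mask, the expanded IR now looks like the hand-written sketch below (value and block names are illustrative; the scalarize_v2i64 test updated in this commit checks the same structure). Because the insert chain begins at %passthru rather than undef, the phi in the final block is already the result and the old trailing select is gone.

define <2 x i64> @expanded_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %passthru) {
entry:
  %scalar.p = bitcast <2 x i64>* %p to i64*
  %m0 = extractelement <2 x i1> %mask, i32 0
  br i1 %m0, label %cond.load, label %else

cond.load:
  %gep0 = getelementptr inbounds i64, i64* %scalar.p, i32 0
  %elt0 = load i64, i64* %gep0, align 8
  ; The insert chain now starts at %passthru instead of undef...
  %v0 = insertelement <2 x i64> %passthru, i64 %elt0, i32 0
  br label %else

else:
  %res.phi.else = phi <2 x i64> [ %v0, %cond.load ], [ %passthru, %entry ]
  %m1 = extractelement <2 x i1> %mask, i32 1
  br i1 %m1, label %cond.load1, label %else2

cond.load1:
  %gep1 = getelementptr inbounds i64, i64* %scalar.p, i32 1
  %elt1 = load i64, i64* %gep1, align 8
  %v1 = insertelement <2 x i64> %res.phi.else, i64 %elt1, i32 1
  br label %else2

else2:
  ; ...so this phi is already the final result; no select against
  ; %passthru is needed.
  %res = phi <2 x i64> [ %v1, %cond.load1 ], [ %res.phi.else, %else ]
  ret <2 x i64> %res
}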
commit 7d234d6628
parent dfc0f289fa
@@ -90,7 +90,7 @@ FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
 //  cond.load:                                        ; preds = %0
 //    %3 = getelementptr i32* %1, i32 0
 //    %4 = load i32* %3
-//    %5 = insertelement <16 x i32> undef, i32 %4, i32 0
+//    %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0
 //    br label %else
 //
 //  else:                                             ; preds = %0, %cond.load
@@ -146,10 +146,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
   Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
   unsigned VectorWidth = VecType->getNumElements();
 
-  Value *UndefVal = UndefValue::get(VecType);
-
   // The result vector
-  Value *VResult = UndefVal;
+  Value *VResult = Src0;
 
   if (isa<Constant>(Mask)) {
     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
@@ -161,15 +159,11 @@ static void scalarizeMaskedLoad(CallInst *CI) {
       VResult =
           Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
     }
-    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
-    CI->replaceAllUsesWith(NewI);
+    CI->replaceAllUsesWith(VResult);
     CI->eraseFromParent();
     return;
   }
 
-  PHINode *Phi = nullptr;
-  Value *PrevPhi = UndefVal;
-
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
@@ -177,13 +171,6 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
     //  br i1 %mask_1, label %cond.load, label %else
     //
-    if (Idx > 0) {
-      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
-      Phi->addIncoming(VResult, CondBlock);
-      Phi->addIncoming(PrevPhi, PrevIfBlock);
-      PrevPhi = Phi;
-      VResult = Phi;
-    }
-
     Value *Predicate =
         Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
@@ -200,7 +187,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     Value *Gep =
         Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
     LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
-    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+    Value *NewVResult = Builder.CreateInsertElement(VResult, Load,
+                                                    Builder.getInt32(Idx));
 
     // Create "else" block, fill it in the next iteration
     BasicBlock *NewIfBlock =
@@ -211,13 +199,15 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     OldBr->eraseFromParent();
     PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
+
+    // Create the phi to join the new and previous value.
+    PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+    Phi->addIncoming(NewVResult, CondBlock);
+    Phi->addIncoming(VResult, PrevIfBlock);
+    VResult = Phi;
   }
 
-  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
-  Phi->addIncoming(VResult, CondBlock);
-  Phi->addIncoming(PrevPhi, PrevIfBlock);
-  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
-  CI->replaceAllUsesWith(NewI);
+  CI->replaceAllUsesWith(VResult);
   CI->eraseFromParent();
 }
@@ -12,50 +12,20 @@ define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double
 ; AVX-LABEL: loadv1:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    ## implicit-def: $xmm1
-; AVX-NEXT:    je LBB0_1
-; AVX-NEXT:  ## %bb.2: ## %else
-; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    jne LBB0_3
-; AVX-NEXT:  LBB0_4: ## %else
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
-; AVX-NEXT:    retq
-; AVX-NEXT:  LBB0_1: ## %cond.load
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    testq %rdi, %rdi
-; AVX-NEXT:    je LBB0_4
-; AVX-NEXT:  LBB0_3: ## %else
-; AVX-NEXT:    vmovaps %xmm0, %xmm1
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    jne LBB0_2
+; AVX-NEXT:  ## %bb.1: ## %cond.load
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:  LBB0_2: ## %else
+; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: loadv1:
-; AVX512F:       ## %bb.0:
-; AVX512F-NEXT:    testq %rdi, %rdi
-; AVX512F-NEXT:    ## implicit-def: $xmm1
-; AVX512F-NEXT:    jne LBB0_2
-; AVX512F-NEXT:  ## %bb.1: ## %cond.load
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:  LBB0_2: ## %else
-; AVX512F-NEXT:    testq %rdi, %rdi
-; AVX512F-NEXT:    sete %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT:    retq
-;
-; SKX-LABEL: loadv1:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    testq %rdi, %rdi
-; SKX-NEXT:    ## implicit-def: $xmm1
-; SKX-NEXT:    jne LBB0_2
-; SKX-NEXT:  ## %bb.1: ## %cond.load
-; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; SKX-NEXT:  LBB0_2: ## %else
-; SKX-NEXT:    testq %rdi, %rdi
-; SKX-NEXT:    sete %al
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; SKX-NEXT:    retq
+; AVX512-LABEL: loadv1:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    testq %rdi, %rdi
+; AVX512-NEXT:    jne LBB0_2
+; AVX512-NEXT:  ## %bb.1: ## %cond.load
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:  LBB0_2: ## %else
+; AVX512-NEXT:    retq
   %mask = icmp eq <1 x i64> %trigger, zeroinitializer
   %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
   ret <1 x double> %res
@@ -9,10 +9,10 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass
 ; CHECK:       cond.load:
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP4]], i32 0
 ; CHECK-NEXT:    br label [[ELSE]]
 ; CHECK:       else:
-; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ undef, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[MASK]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
 ; CHECK:       cond.load1:
@@ -21,9 +21,8 @@ define <2 x i64> @scalarize_v2i64(<2 x i64>* %p, <2 x i1> %mask, <2 x i64> %pass
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP8]], i32 1
 ; CHECK-NEXT:    br label [[ELSE2]]
 ; CHECK:       else2:
-; CHECK-NEXT:    [[RES_PHI_SELECT:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[MASK]], <2 x i64> [[RES_PHI_SELECT]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP9]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> %mask, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -41,8 +40,7 @@ define <2 x i64> @scalarize_v2i64_ones_mask(<2 x i64>* %p, <2 x i64> %passthru)
 define <2 x i64> @scalarize_v2i64_zero_mask(<2 x i64>* %p, <2 x i64> %passthru) {
 ; CHECK-LABEL: @scalarize_v2i64_zero_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
-; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x i64> undef, <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+; CHECK-NEXT:    ret <2 x i64> [[PASSTHRU:%.*]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
   ret <2 x i64> %ret
@@ -53,9 +51,8 @@ define <2 x i64> @scalarize_v2i64_const_mask(<2 x i64>* %p, <2 x i64> %passthru)
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[P:%.*]] to i64*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> <i1 false, i1 true>, <2 x i64> [[TMP4]], <2 x i64> [[PASSTHRU:%.*]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
 ;
   %ret = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
   ret <2 x i64> %ret