diff --git a/llvm/include/llvm/Transforms/Utils/VectorUtils.h b/llvm/include/llvm/Transforms/Utils/VectorUtils.h
index e1d6c562923b..44a7149eee98 100644
--- a/llvm/include/llvm/Transforms/Utils/VectorUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/VectorUtils.h
@@ -48,12 +48,27 @@ static inline bool isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::pow:
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz:
+  case Intrinsic::powi:
     return true;
   default:
     return false;
   }
 }
 
+static bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
+                                         unsigned ScalarOpdIdx) {
+  switch (ID) {
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz:
+  case Intrinsic::powi:
+    return (ScalarOpdIdx == 1);
+  default:
+    return false;
+  }
+}
+
 static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
                                               Intrinsic::ID ValidIntrinsicID) {
   if (I.getNumArgOperands() != 1 ||
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ba2b7eea363f..15d4c1c79d2b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3123,9 +3123,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
         scalarizeInstruction(it);
         break;
       default:
+        bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
         for (unsigned Part = 0; Part < UF; ++Part) {
           SmallVector<Value *, 4> Args;
           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+            if (HasScalarOpd && i == 1) {
+              Args.push_back(CI->getArgOperand(i));
+              continue;
+            }
             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
             Args.push_back(Arg[Part]);
           }
@@ -3474,6 +3479,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         return false;
       }
 
+      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
+      // second argument is the same (i.e. loop invariant).
+      if (CI &&
+          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+          return false;
+        }
+      }
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(it->getType()) &&
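Note: when HasScalarOpd is set, the widened call keeps operand 1 scalar and only the remaining operands are replaced by their per-part vector values. A minimal before/after sketch for VF=4 (a hedged illustration; the value names are invented here, not taken from the patch):

    ; scalar loop body, %P loop invariant
    %v = load double* %p, align 8
    %r = call double @llvm.powi.f64(double %v, i32 %P)

    ; widened form for one unroll part: the data operand becomes a vector,
    ; the exponent %P is passed through unchanged
    %v.vec = load <4 x double>* %p.vec, align 8
    %r.vec = call <4 x double> @llvm.powi.v4f64(<4 x double> %v.vec, i32 %P)

This is also why canVectorizeInstrs requires the second argument to be loop invariant: a per-iteration exponent would need a vector operand, and the vector overload takes a scalar i32.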
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e13ba956c398..ce0a009e1cb1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -961,9 +961,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
       Function *Int = CI->getCalledFunction();
-
+      Value *A1I = nullptr;
+      if (hasVectorInstrinsicScalarOpd(ID, 1))
+        A1I = CI->getArgOperand(1);
       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
         if (!CI2 || CI2->getCalledFunction() != Int ||
@@ -973,6 +974,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
                        << "\n");
           return;
         }
+        // ctlz, cttz and powi are special intrinsics whose second argument
+        // should be the same in order for them to be vectorized.
+        if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+          Value *A1J = CI2->getArgOperand(1);
+          if (A1I != A1J) {
+            newTreeEntry(VL, false);
+            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+                         << " argument " << *A1I << " != " << *A1J
+                         << "\n");
+            return;
+          }
+        }
       }
 
       newTreeEntry(VL, true);
@@ -1652,9 +1665,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E->Scalars);
+      Function *FI;
+      Intrinsic::ID IID = Intrinsic::not_intrinsic;
+      if (CI && (FI = CI->getCalledFunction())) {
+        IID = (Intrinsic::ID) FI->getIntrinsicID();
+      }
      std::vector<Value *> OpVecs;
      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
        ValueList OpVL;
+        // ctlz, cttz and powi are special intrinsics whose second argument is
+        // a scalar. This argument should not be vectorized.
+        if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+          CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+          OpVecs.push_back(CEI->getArgOperand(j));
+          continue;
+        }
        for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
          CallInst *CEI = cast<CallInst>(E->Scalars[i]);
          OpVL.push_back(CEI->getArgOperand(j));
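Note: in vectorizeTree the scalar operand is taken from the first bundled call (E->Scalars[0]); that is only sound because buildTree_rec refuses to bundle calls whose second arguments differ. A sketch of the intended rewrite for a bundle of four ctlz calls (a hedged illustration; value names are invented):

    ; four isomorphic scalar calls with an identical i1 flag
    %c0 = call i32 @llvm.ctlz.i32(i32 %a0, i1 true)
    %c1 = call i32 @llvm.ctlz.i32(i32 %a1, i1 true)
    %c2 = call i32 @llvm.ctlz.i32(i32 %a2, i1 true)
    %c3 = call i32 @llvm.ctlz.i32(i32 %a3, i1 true)

    ; one vector call: operand 0 comes from the vectorized operand tree,
    ; operand 1 is the scalar flag taken from the first call
    %c.vec = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a.vec, i1 true)

If the flags differ, buildTree_rec records a gather (newTreeEntry(VL, false)) and the calls stay scalar.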
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index c3d570c03a77..7dfaf03b0f2d 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1090,3 +1090,105 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone
+
+;CHECK-LABEL: @powi_f64(
+;CHECK: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone
+  %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @powi_f64_neg(
+;CHECK-NOT: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %1 = trunc i64 %indvars.iv to i32
+  %call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.cttz.i64 (i64, i1) nounwind readnone
+
+;CHECK-LABEL: @cttz_f64(
+;CHECK: llvm.cttz.v4i64
+;CHECK: ret void
+define void @cttz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
+  %0 = load i64* %arrayidx, align 8
+  %call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone
+  %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
+  store i64 %call, i64* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.ctlz.i64 (i64, i1) nounwind readnone
+
+;CHECK-LABEL: @ctlz_f64(
+;CHECK: llvm.ctlz.v4i64
+;CHECK: ret void
+define void @ctlz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
+  %0 = load i64* %arrayidx, align 8
+  %call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone
+  %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
+  store i64 %call, i64* %arrayidx4, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
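The CHECK lines in the tests above and below match only the mangled vector names. For reference, the overloads the vectorizers are expected to emit would be declared roughly as follows; the trailing scalar operand keeps its scalar type and is not mangled into the name (these declarations are an illustration, not part of the patch):

    declare <4 x double> @llvm.powi.v4f64(<4 x double>, i32)
    declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
    declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)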
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
index 30c509369730..937252f4146b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -117,3 +117,270 @@ entry:
 ; CHECK: store <4 x i32>
 ; CHECK: ret
 }
+
+declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone
+
+define void @vec_ctlz_i32(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 true) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 true) nounwind readnone
+
+  store i32 %call1, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+  store i32 %call2, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+  store i32 %call3, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+  store i32 %call4, i32* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_ctlz_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.ctlz.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
+
+define void @vec_ctlz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 false) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 false) nounwind readnone
+
+  store i32 %call1, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+  store i32 %call2, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+  store i32 %call3, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+  store i32 %call4, i32* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_ctlz_i32_neg(
+; CHECK-NOT: call <4 x i32> @llvm.ctlz.v4i32
+
+}
+
+
+declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone
+
+define void @vec_cttz_i32(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 true) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 true) nounwind readnone
+
+  store i32 %call1, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+  store i32 %call2, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+  store i32 %call3, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+  store i32 %call4, i32* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_cttz_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.cttz.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
+
+define void @vec_cttz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 false) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 false) nounwind readnone
+
+  store i32 %call1, i32* %c, align 4
+  %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+  store i32 %call2, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+  store i32 %call3, i32* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+  store i32 %call4, i32* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_cttz_i32_neg(
+; CHECK-NOT: call <4 x i32> @llvm.cttz.v4i32
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+
+define void @vec_powi_f32(float* %a, float* %b, float* %c, i32 %P) {
+entry:
+  %i0 = load float* %a, align 4
+  %i1 = load float* %b, align 4
+  %add1 = fadd float %i0, %i1
+  %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds float* %a, i32 1
+  %i2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %b, i32 1
+  %i3 = load float* %arrayidx3, align 4
+  %add2 = fadd float %i2, %i3
+  %call2 = tail call float @llvm.powi.f32(float %add2,i32 %P) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds float* %a, i32 2
+  %i4 = load float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float* %b, i32 2
+  %i5 = load float* %arrayidx5, align 4
+  %add3 = fadd float %i4, %i5
+  %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds float* %a, i32 3
+  %i6 = load float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float* %b, i32 3
+  %i7 = load float* %arrayidx7, align 4
+  %add4 = fadd float %i6, %i7
+  %call4 = tail call float @llvm.powi.f32(float %add4,i32 %P) nounwind readnone
+
+  store float %call1, float* %c, align 4
+  %arrayidx8 = getelementptr inbounds float* %c, i32 1
+  store float %call2, float* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds float* %c, i32 2
+  store float %call3, float* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds float* %c, i32 3
+  store float %call4, float* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_powi_f32(
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: call <4 x float> @llvm.powi.v4f32
+; CHECK: store <4 x float>
+; CHECK: ret
+}
+
+
+define void @vec_powi_f32_neg(float* %a, float* %b, float* %c, i32 %P, i32 %Q) {
+entry:
+  %i0 = load float* %a, align 4
+  %i1 = load float* %b, align 4
+  %add1 = fadd float %i0, %i1
+  %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds float* %a, i32 1
+  %i2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %b, i32 1
+  %i3 = load float* %arrayidx3, align 4
+  %add2 = fadd float %i2, %i3
+  %call2 = tail call float @llvm.powi.f32(float %add2,i32 %Q) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds float* %a, i32 2
+  %i4 = load float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float* %b, i32 2
+  %i5 = load float* %arrayidx5, align 4
+  %add3 = fadd float %i4, %i5
+  %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds float* %a, i32 3
+  %i6 = load float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float* %b, i32 3
+  %i7 = load float* %arrayidx7, align 4
+  %add4 = fadd float %i6, %i7
+  %call4 = tail call float @llvm.powi.f32(float %add4,i32 %Q) nounwind readnone
+
+  store float %call1, float* %c, align 4
+  %arrayidx8 = getelementptr inbounds float* %c, i32 1
+  store float %call2, float* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds float* %c, i32 2
+  store float %call3, float* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds float* %c, i32 3
+  store float %call4, float* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @vec_powi_f32_neg(
+; CHECK-NOT: call <4 x float> @llvm.powi.v4f32
+}