Allow vectorization of intrinsics such as powi, cttz and ctlz in the Loop and SLP Vectorizers.

This patch adds support for vectorizing intrinsics such as powi, cttz and ctlz in the Loop and SLP Vectorizers. These intrinsics differ from
other intrinsics in that their second argument must be the same across all calls being vectorized, and it must remain a scalar in the vectorized call.
Review: http://reviews.llvm.org/D3851#inline-32769 and http://reviews.llvm.org/D3937#inline-32857

llvm-svn: 209873
This commit is contained in:
Karthik Bhat 2014-05-30 04:31:24 +00:00
parent 6cd3ebb223
commit 5ab7795649
5 changed files with 426 additions and 2 deletions

View File

@ -48,12 +48,27 @@ static inline bool isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
case Intrinsic::ctlz:
case Intrinsic::cttz:
case Intrinsic::powi:
return true;
default:
return false;
}
}
/// Report whether the operand at position \p ScalarOpdIdx of the intrinsic
/// \p ID stays scalar when the intrinsic is vectorized.
///
/// Only ctlz, cttz and powi have such an operand, and only at index 1 (their
/// second argument); every other intrinsic/index combination yields false.
static bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
                                         unsigned ScalarOpdIdx) {
  const bool TakesScalarSecondOpd = ID == Intrinsic::ctlz ||
                                    ID == Intrinsic::cttz ||
                                    ID == Intrinsic::powi;
  return TakesScalarSecondOpd && ScalarOpdIdx == 1;
}
static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
Intrinsic::ID ValidIntrinsicID) {
if (I.getNumArgOperands() != 1 ||

View File

@ -3123,9 +3123,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
scalarizeInstruction(it);
break;
default:
bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
if (HasScalarOpd && i == 1) {
Args.push_back(CI->getArgOperand(i));
continue;
}
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
@ -3474,6 +3479,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}
// Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
// second argument is the same (i.e. loop invariant)
if (CI &&
hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
}
}
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(it->getType()) &&

View File

@ -961,9 +961,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
Value *A1I = nullptr;
if (hasVectorInstrinsicScalarOpd(ID, 1))
A1I = CI->getArgOperand(1);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
@ -973,6 +974,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
<< "\n");
return;
}
// ctlz,cttz and powi are special intrinsics whose second argument
// should be same in order for them to be vectorized.
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
return;
}
}
}
newTreeEntry(VL, true);
@ -1652,9 +1665,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (CI && (FI = CI->getCalledFunction())) {
IID = (Intrinsic::ID) FI->getIntrinsicID();
}
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// ctlz,cttz and powi are special intrinsics whose second argument is
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
CallInst *CEI = cast<CallInst>(E->Scalars[i]);
OpVL.push_back(CEI->getArgOperand(j));

View File

@ -1090,3 +1090,105 @@ for.end: ; preds = %for.body
ret void
}
declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone
; Positive loop-vectorizer test for llvm.powi: x[i] = powi(y[i], %P).
; The exponent %P is a function argument, so it has the same value on every
; iteration, and the test expects the call to be widened to llvm.powi.v4f64.
;CHECK-LABEL: @powi_f64(
;CHECK: llvm.powi.v4f64
;CHECK: ret void
define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable {
entry:
; Skip the loop entirely when %n <= 0.
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
%0 = load double* %arrayidx, align 8
; Second operand is the loop-invariant argument %P.
%call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone
%arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
store double %call, double* %arrayidx4, align 8
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
; Negative loop-vectorizer test for llvm.powi: x[i] = powi(y[i], i).
; The exponent is derived from the induction variable, so it changes on every
; iteration, and the test expects that no llvm.powi.v4f64 call is produced.
;CHECK-LABEL: @powi_f64_neg(
;CHECK-NOT: llvm.powi.v4f64
;CHECK: ret void
define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
entry:
; Skip the loop entirely when %n <= 0.
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
%0 = load double* %arrayidx, align 8
; The exponent %1 is the truncated induction variable, i.e. not loop-invariant.
%1 = trunc i64 %indvars.iv to i32
%call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone
%arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
store double %call, double* %arrayidx4, align 8
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
declare i64 @llvm.cttz.i64 (i64, i1) nounwind readnone
; Positive loop-vectorizer test for llvm.cttz: x[i] = cttz(y[i], true).
; The i1 flag is the constant true on every iteration, and the test expects
; the call to be widened to llvm.cttz.v4i64.
; Note: despite the "_f64" suffix in its name, this test operates on i64 data.
;CHECK-LABEL: @cttz_f64(
;CHECK: llvm.cttz.v4i64
;CHECK: ret void
define void @cttz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
entry:
; Skip the loop entirely when %n <= 0.
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
%0 = load i64* %arrayidx, align 8
; Second operand is the constant flag "i1 true".
%call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone
%arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
store i64 %call, i64* %arrayidx4, align 8
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
declare i64 @llvm.ctlz.i64 (i64, i1) nounwind readnone
; Positive loop-vectorizer test for llvm.ctlz: x[i] = ctlz(y[i], true).
; The i1 flag is the constant true on every iteration, and the test expects
; the call to be widened to llvm.ctlz.v4i64.
; Note: despite the "_f64" suffix in its name, this test operates on i64 data.
;CHECK-LABEL: @ctlz_f64(
;CHECK: llvm.ctlz.v4i64
;CHECK: ret void
define void @ctlz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
entry:
; Skip the loop entirely when %n <= 0.
%cmp9 = icmp sgt i32 %n, 0
br i1 %cmp9, label %for.body, label %for.end
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
%0 = load i64* %arrayidx, align 8
; Second operand is the constant flag "i1 true".
%call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone
%arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
store i64 %call, i64* %arrayidx4, align 8
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}

View File

@ -117,3 +117,270 @@ entry:
; CHECK: store <4 x i32>
; CHECK: ret
}
declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone
; Positive SLP test for llvm.ctlz: c[k] = ctlz(a[k] + b[k], true) for k = 0..3.
; All four scalar calls pass the same flag (i1 true), and the test expects them
; to be combined into a single llvm.ctlz.v4i32 call fed by vector loads and
; followed by a vector store.
define void @vec_ctlz_i32(i32* %a, i32* %b, i32* %c, i1) {
entry:
; Lane 0.
%i0 = load i32* %a, align 4
%i1 = load i32* %b, align 4
%add1 = add i32 %i0, %i1
%call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
; Lane 1.
%arrayidx2 = getelementptr inbounds i32* %a, i32 1
%i2 = load i32* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds i32* %b, i32 1
%i3 = load i32* %arrayidx3, align 4
%add2 = add i32 %i2, %i3
%call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 true) nounwind readnone
; Lane 2.
%arrayidx4 = getelementptr inbounds i32* %a, i32 2
%i4 = load i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds i32* %b, i32 2
%i5 = load i32* %arrayidx5, align 4
%add3 = add i32 %i4, %i5
%call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
; Lane 3.
%arrayidx6 = getelementptr inbounds i32* %a, i32 3
%i6 = load i32* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds i32* %b, i32 3
%i7 = load i32* %arrayidx7, align 4
%add4 = add i32 %i6, %i7
%call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 true) nounwind readnone
; Store the four results to consecutive elements of %c.
store i32 %call1, i32* %c, align 4
%arrayidx8 = getelementptr inbounds i32* %c, i32 1
store i32 %call2, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32* %c, i32 2
store i32 %call3, i32* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds i32* %c, i32 3
store i32 %call4, i32* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_ctlz_i32(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: call <4 x i32> @llvm.ctlz.v4i32
; CHECK: store <4 x i32>
; CHECK: ret
}
; Negative SLP test for llvm.ctlz: same shape as @vec_ctlz_i32, but the i1 flag
; alternates true/false/true/false across the four calls. Because the second
; argument differs between lanes, the test expects that no llvm.ctlz.v4i32 call
; is produced.
define void @vec_ctlz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
entry:
; Lane 0: flag is true.
%i0 = load i32* %a, align 4
%i1 = load i32* %b, align 4
%add1 = add i32 %i0, %i1
%call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
; Lane 1: flag is false.
%arrayidx2 = getelementptr inbounds i32* %a, i32 1
%i2 = load i32* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds i32* %b, i32 1
%i3 = load i32* %arrayidx3, align 4
%add2 = add i32 %i2, %i3
%call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 false) nounwind readnone
; Lane 2: flag is true.
%arrayidx4 = getelementptr inbounds i32* %a, i32 2
%i4 = load i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds i32* %b, i32 2
%i5 = load i32* %arrayidx5, align 4
%add3 = add i32 %i4, %i5
%call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
; Lane 3: flag is false.
%arrayidx6 = getelementptr inbounds i32* %a, i32 3
%i6 = load i32* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds i32* %b, i32 3
%i7 = load i32* %arrayidx7, align 4
%add4 = add i32 %i6, %i7
%call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 false) nounwind readnone
; Store the four results to consecutive elements of %c.
store i32 %call1, i32* %c, align 4
%arrayidx8 = getelementptr inbounds i32* %c, i32 1
store i32 %call2, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32* %c, i32 2
store i32 %call3, i32* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds i32* %c, i32 3
store i32 %call4, i32* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_ctlz_i32_neg(
; CHECK-NOT: call <4 x i32> @llvm.ctlz.v4i32
}
declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone
; Positive SLP test for llvm.cttz: c[k] = cttz(a[k] + b[k], true) for k = 0..3.
; All four scalar calls pass the same flag (i1 true), and the test expects them
; to be combined into a single llvm.cttz.v4i32 call fed by vector loads and
; followed by a vector store.
define void @vec_cttz_i32(i32* %a, i32* %b, i32* %c, i1) {
entry:
; Lane 0.
%i0 = load i32* %a, align 4
%i1 = load i32* %b, align 4
%add1 = add i32 %i0, %i1
%call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
; Lane 1.
%arrayidx2 = getelementptr inbounds i32* %a, i32 1
%i2 = load i32* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds i32* %b, i32 1
%i3 = load i32* %arrayidx3, align 4
%add2 = add i32 %i2, %i3
%call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 true) nounwind readnone
; Lane 2.
%arrayidx4 = getelementptr inbounds i32* %a, i32 2
%i4 = load i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds i32* %b, i32 2
%i5 = load i32* %arrayidx5, align 4
%add3 = add i32 %i4, %i5
%call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
; Lane 3.
%arrayidx6 = getelementptr inbounds i32* %a, i32 3
%i6 = load i32* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds i32* %b, i32 3
%i7 = load i32* %arrayidx7, align 4
%add4 = add i32 %i6, %i7
%call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 true) nounwind readnone
; Store the four results to consecutive elements of %c.
store i32 %call1, i32* %c, align 4
%arrayidx8 = getelementptr inbounds i32* %c, i32 1
store i32 %call2, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32* %c, i32 2
store i32 %call3, i32* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds i32* %c, i32 3
store i32 %call4, i32* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_cttz_i32(
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: call <4 x i32> @llvm.cttz.v4i32
; CHECK: store <4 x i32>
; CHECK: ret
}
; Negative SLP test for llvm.cttz: same shape as @vec_cttz_i32, but the i1 flag
; alternates true/false/true/false across the four calls. Because the second
; argument differs between lanes, the test expects that no llvm.cttz.v4i32 call
; is produced.
define void @vec_cttz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
entry:
; Lane 0: flag is true.
%i0 = load i32* %a, align 4
%i1 = load i32* %b, align 4
%add1 = add i32 %i0, %i1
%call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
; Lane 1: flag is false.
%arrayidx2 = getelementptr inbounds i32* %a, i32 1
%i2 = load i32* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds i32* %b, i32 1
%i3 = load i32* %arrayidx3, align 4
%add2 = add i32 %i2, %i3
%call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 false) nounwind readnone
; Lane 2: flag is true.
%arrayidx4 = getelementptr inbounds i32* %a, i32 2
%i4 = load i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds i32* %b, i32 2
%i5 = load i32* %arrayidx5, align 4
%add3 = add i32 %i4, %i5
%call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
; Lane 3: flag is false.
%arrayidx6 = getelementptr inbounds i32* %a, i32 3
%i6 = load i32* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds i32* %b, i32 3
%i7 = load i32* %arrayidx7, align 4
%add4 = add i32 %i6, %i7
%call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 false) nounwind readnone
; Store the four results to consecutive elements of %c.
store i32 %call1, i32* %c, align 4
%arrayidx8 = getelementptr inbounds i32* %c, i32 1
store i32 %call2, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32* %c, i32 2
store i32 %call3, i32* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds i32* %c, i32 3
store i32 %call4, i32* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_cttz_i32_neg(
; CHECK-NOT: call <4 x i32> @llvm.cttz.v4i32
}
declare float @llvm.powi.f32(float, i32)
; Positive SLP test for llvm.powi: c[k] = powi(a[k] + b[k], %P) for k = 0..3.
; All four scalar calls use the same exponent %P, and the test expects them to
; be combined into a single llvm.powi.v4f32 call fed by vector loads and
; followed by a vector store.
define void @vec_powi_f32(float* %a, float* %b, float* %c, i32 %P) {
entry:
; Lane 0.
%i0 = load float* %a, align 4
%i1 = load float* %b, align 4
%add1 = fadd float %i0, %i1
%call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
; Lane 1.
%arrayidx2 = getelementptr inbounds float* %a, i32 1
%i2 = load float* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds float* %b, i32 1
%i3 = load float* %arrayidx3, align 4
%add2 = fadd float %i2, %i3
%call2 = tail call float @llvm.powi.f32(float %add2,i32 %P) nounwind readnone
; Lane 2.
%arrayidx4 = getelementptr inbounds float* %a, i32 2
%i4 = load float* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds float* %b, i32 2
%i5 = load float* %arrayidx5, align 4
%add3 = fadd float %i4, %i5
%call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
; Lane 3.
%arrayidx6 = getelementptr inbounds float* %a, i32 3
%i6 = load float* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds float* %b, i32 3
%i7 = load float* %arrayidx7, align 4
%add4 = fadd float %i6, %i7
%call4 = tail call float @llvm.powi.f32(float %add4,i32 %P) nounwind readnone
; Store the four results to consecutive elements of %c.
store float %call1, float* %c, align 4
%arrayidx8 = getelementptr inbounds float* %c, i32 1
store float %call2, float* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds float* %c, i32 2
store float %call3, float* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds float* %c, i32 3
store float %call4, float* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_powi_f32(
; CHECK: load <4 x float>
; CHECK: load <4 x float>
; CHECK: call <4 x float> @llvm.powi.v4f32
; CHECK: store <4 x float>
; CHECK: ret
}
; Negative SLP test for llvm.powi: same shape as @vec_powi_f32, but the
; exponent alternates %P/%Q/%P/%Q across the four calls. Because the second
; argument differs between lanes, the test expects that no llvm.powi.v4f32 call
; is produced.
define void @vec_powi_f32_neg(float* %a, float* %b, float* %c, i32 %P, i32 %Q) {
entry:
; Lane 0: exponent is %P.
%i0 = load float* %a, align 4
%i1 = load float* %b, align 4
%add1 = fadd float %i0, %i1
%call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
; Lane 1: exponent is %Q.
%arrayidx2 = getelementptr inbounds float* %a, i32 1
%i2 = load float* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds float* %b, i32 1
%i3 = load float* %arrayidx3, align 4
%add2 = fadd float %i2, %i3
%call2 = tail call float @llvm.powi.f32(float %add2,i32 %Q) nounwind readnone
; Lane 2: exponent is %P.
%arrayidx4 = getelementptr inbounds float* %a, i32 2
%i4 = load float* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds float* %b, i32 2
%i5 = load float* %arrayidx5, align 4
%add3 = fadd float %i4, %i5
%call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
; Lane 3: exponent is %Q.
%arrayidx6 = getelementptr inbounds float* %a, i32 3
%i6 = load float* %arrayidx6, align 4
%arrayidx7 = getelementptr inbounds float* %b, i32 3
%i7 = load float* %arrayidx7, align 4
%add4 = fadd float %i6, %i7
%call4 = tail call float @llvm.powi.f32(float %add4,i32 %Q) nounwind readnone
; Store the four results to consecutive elements of %c.
store float %call1, float* %c, align 4
%arrayidx8 = getelementptr inbounds float* %c, i32 1
store float %call2, float* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds float* %c, i32 2
store float %call3, float* %arrayidx9, align 4
%arrayidx10 = getelementptr inbounds float* %c, i32 3
store float %call4, float* %arrayidx10, align 4
ret void
; CHECK-LABEL: @vec_powi_f32_neg(
; CHECK-NOT: call <4 x float> @llvm.powi.v4f32
}