[LoopVectorize] Fix cost for calls to functions that have vector versions

A recent commit
(https://reviews.llvm.org/rG66c120f02560ef528a60924104ead66f330190f1) changed
the cost for calls to functions that have a vector version for some
vectorization factor. However, no check is performed for whether the
vectorization factor matches the current one being cost modeled. This leads to
attempts to widen call instructions to a vectorization factor for which such a
function does not exist, which in turn leads to an assertion failure.

This patch adds the check for vectorization factor (i.e. not just that the
called function has a vector version for some VF, but that it has a vector
version for this VF).

Differential revision: https://reviews.llvm.org/D74944
This commit is contained in:
Nemanja Ivanovic 2020-02-26 17:13:14 -06:00
parent 79493e721a
commit c46b85aaf4
3 changed files with 123 additions and 1 deletions

View File

@@ -3277,7 +3277,10 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
-  if (!TLI || CI->isNoBuiltin() || VFDatabase::getMappings(*CI).empty())
+  VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
+  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+  if (!TLI || CI->isNoBuiltin() || !VecFunc)
     return Cost;
   // If the corresponding vector cost is cheaper, return its cost.

View File

@@ -0,0 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-library=MASSV -force-vector-interleave=1 \
; RUN: -vectorizer-maximize-bandwidth -O2 -loop-vectorize \
; RUN: -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s
; Regression test: the loop below contains a call to @llvm.sin.f64, which the
; MASSV vector library (enabled via -vector-library=MASSV on the RUN line)
; maps to @__sind2_massv for VF=2. The CHECK lines below verify the loop is
; vectorized at VF=2 and the scalar sin call is widened to @__sind2_massv.
; NOTE(review): checks were autogenerated by update_test_checks.py — do not
; hand-edit the CHECK block; regenerate it instead.
define dso_local double @test(float* %Arr) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[ARR:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <2 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x double> @__sind2_massv(<2 x double> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = fadd fast <2 x double> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[DOTLCSSA]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[DOTLCSSA]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: ret double [[TMP7]]
;
; Scalar input loop: Sum += sin(fpext(Arr[i])) for i = 0..127, then return Sum.
entry:
br label %for.cond
for.cond:
; Loop-carried running sum and induction variable.
%Sum.0 = phi double [ 0.000000e+00, %entry ], [ %add, %for.inc ]
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
; Trip count is 128, matching the icmp against 128 in the vector body above.
%cmp = icmp slt i32 %i.0, 128
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
br label %for.end
for.body:
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %Arr, i64 %idxprom
%0 = load float, float* %arrayidx, align 4
; Widen the float element to double before calling the double-precision sin.
%conv = fpext float %0 to double
%1 = call fast double @llvm.sin.f64(double %conv)
%add = fadd fast double %Sum.0, %1
br label %for.inc
for.inc:
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end:
ret double %Sum.0
}
declare double @llvm.sin.f64(double)

View File

@@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vectorizer-maximize-bandwidth -mtriple=powerpc64le-- -S \
; RUN: -targetlibinfo -loop-simplify -loop-rotate -loop-vectorize \
; RUN: -instcombine -simplifycfg -force-vector-interleave=1 < %s | FileCheck %s
; Same sum-of-sin loop as the MASSV test, but here the VF=2 mapping comes from
; the call-site attribute #1 ("vector-function-abi-variant" naming
; @__sind2_massv, declared at file scope) rather than from -vector-library on
; the RUN line. The CHECK lines verify the loop still vectorizes at VF=2 and
; the call is widened to @__sind2_massv.
; NOTE(review): checks were autogenerated by update_test_checks.py — do not
; hand-edit the CHECK block; regenerate it instead.
define dso_local double @test(float* %Arr) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[ARR:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <2 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x double> @__sind2_massv(<2 x double> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: ret double [[TMP7]]
;
; Scalar input loop: Sum += sin(fpext(Arr[i])) for i = 0..127, then return Sum.
entry:
br label %for.cond
for.cond:
; Loop-carried running sum and induction variable.
%Sum.0 = phi double [ 0.000000e+00, %entry ], [ %add, %for.inc ]
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
; Trip count is 128, matching the icmp against 128 in the vector body above.
%cmp = icmp slt i32 %i.0, 128
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup:
br label %for.end
for.body:
%idxprom = sext i32 %i.0 to i64
%arrayidx = getelementptr inbounds float, float* %Arr, i64 %idxprom
%0 = load float, float* %arrayidx, align 4
; Widen the float element to double before calling the double-precision sin.
%conv = fpext float %0 to double
; Attribute #1 carries the "vector-function-abi-variant" mapping that makes
; @__sind2_massv available to the vectorizer for this call.
%1 = call fast double @llvm.sin.f64(double %conv) #1
%add = fadd fast double %Sum.0, %1
br label %for.inc
for.inc:
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end:
ret double %Sum.0
}
declare double @llvm.sin.f64(double) #0
declare <2 x double> @__sind2_massv(<2 x double>) #0
attributes #0 = { nounwind readnone speculatable willreturn }
attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(__sind2_massv)" }