[RISCV] Implement getVScaleForTuning and thus prefer scalable vectorization when enabled

LoopVectorizer uses getVScaleForTuning for deciding how to discount the cost of a potential vector factor by the amount of work performed. Without the callback implemented, the vectorizer was defaulting to an estimated vscale of 1. This results in fixed vectorization looking falsely profitable (since it used the command line VLEN).

The test change is pretty limited since a) we don't have much coverage of the vectorizer with scalable vectors at all, and b) what little coverage we have mostly uses i64 element types. There's a separate issue with <vscale x 1 x i64> which prevents us from getting to this stage of costing, and thus only the one test explicitly written to avoid that is visible in the diff. However, this is actually a very wide impact change as it changes the practical vectorization result when both fixed and scalable is enabled to scalable.

As an aside, I think the vectorizer is at little too strongly biased towards scalable when both are legal, but we can explore that separately. For now, let's just get the cost model working the way it was intended.

Differential Revision: https://reviews.llvm.org/D128547
This commit is contained in:
Philip Reames 2022-06-25 11:18:40 -07:00 committed by Philip Reames
parent d152e50c15
commit 9803b0d1e7
3 changed files with 47 additions and 22 deletions

View File

@ -137,6 +137,12 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
return BaseT::getMaxVScale();
}
Optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
if (ST->hasVInstructions())
return ST->getRealMinVLen() / RISCV::RVVBitsPerBlock;
return BaseT::getVScaleForTuning();
}
TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned LMUL = PowerOf2Floor(

View File

@ -57,6 +57,7 @@ public:
bool shouldExpandReduction(const IntrinsicInst *II) const;
bool supportsScalableVectors() const { return ST->hasVInstructions(); }
Optional<unsigned> getMaxVScale() const;
Optional<unsigned> getVScaleForTuning() const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;

View File

@ -157,35 +157,53 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
;
; VLEN128-LABEL: @vector_add_i32(
; VLEN128-NEXT: entry:
; VLEN128-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128: vector.ph:
; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[V:%.*]], i32 0
; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; VLEN128-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[V]], i32 0
; VLEN128-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i32 0
; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i32 0
; VLEN128-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
; VLEN128: vector.body:
; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
; VLEN128-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
; VLEN128-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; VLEN128-NEXT: [[TMP6:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT: [[TMP7:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
; VLEN128-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP4]], align 4
; VLEN128-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP5]], align 4
; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VLEN128-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; VLEN128-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; VLEN128-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VLEN128-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
; VLEN128-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4
; VLEN128-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
; VLEN128-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
; VLEN128-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP14]]
; VLEN128-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
; VLEN128-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
; VLEN128-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VLEN128-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 2
; VLEN128-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP19]]
; VLEN128-NEXT: store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
; VLEN128-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
; VLEN128-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VLEN128: middle.block:
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128: scalar.ph:
; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT: br label [[FOR_BODY:%.*]]
; VLEN128: for.body:
; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]