From 6968520c3b04afa7cf38ed8db24c4f15970580d5 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 27 Apr 2021 15:46:03 +0100 Subject: [PATCH] Revert "[LoopVectorize] Simplify scalar cost calculation in getInstructionCost" This reverts commit 4afeda9157cffd2daa83f8075d73f1e11ea34c81. --- .../Transforms/Vectorize/LoopVectorize.cpp | 64 ++++++++----------- .../AArch64/no_vector_instructions.ll | 2 +- .../AArch64/predication_costs.ll | 35 ---------- 3 files changed, 27 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ecbb0befcf71..17e9947c1c63 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7316,37 +7316,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto hasSingleCopyAfterVectorization = [this](Instruction *I, - ElementCount VF) -> bool { - if (VF.isScalar()) - return true; - - auto Scalarized = InstsToScalarize.find(VF); - assert(Scalarized != InstsToScalarize.end() && - "VF not yet analyzed for scalarization profitability"); - return !Scalarized->second.count(I) && - llvm::all_of(I->users(), [&](User *U) { - auto *UI = cast(U); - return !Scalarized->second.count(UI); - }); - }; - - if (isScalarAfterVectorization(I, VF)) { - // With the exception of GEPs and PHIs, after scalarization there should - // only be one copy of the instruction generated in the loop. This is - // because the VF is either 1, or any instructions that need scalarizing - // have already been dealt with by the the time we get here. As a result, - // it means we don't have to multiply the instruction cost by VF. - assert(I->getOpcode() == Instruction::GetElementPtr || - I->getOpcode() == Instruction::PHI || - hasSingleCopyAfterVectorization(I, VF)); - VectorTy = RetTy; - } else - VectorTy = ToVectorTy(RetTy, VF); - // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -7474,16 +7447,21 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - return TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + return N * TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { assert(!VF.isScalable() && "VF is assumed to be non scalable."); - return TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None, I->getOperand(0), I); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; + return N * TTI.getArithmeticInstrCost( + I->getOpcode(), VectorTy, CostKind, + TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7627,7 +7605,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } } - return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); + unsigned N; + if (isScalarAfterVectorization(I, VF)) { + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + N = VF.getKnownMinValue(); + } else + N = 1; + return N * + TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } case Instruction::Call: { bool NeedToScalarize; @@ -7642,8 +7627,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::ExtractValue: return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); default: - // This opcode is unknown. Assume that it is the same as 'mul'. - return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); + // The cost of executing VF copies of the scalar instruction. This opcode + // is unknown. Assume that it is the same as 'mul'. + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + + getScalarizationOverhead(I, VF); } // end of switch. } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll index 3061998518ad..247ea35ff5d0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll @@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu" ; CHECK-LABEL: all_scalar ; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 -; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 ; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions ; define void @all_scalar(i64* %a, i64 %n) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll index 858b28ddd321..b0ebb4edf2ad 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -86,41 +86,6 @@ for.end: ret void } -; CHECK-LABEL: predicated_store_phi -; -; Same as predicate_store except we use a pointer PHI to maintain the address -; -; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ] -; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1 -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4 -; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ] -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4 -; -define void @predicated_store_phi(i32* %a, i1 %c, i32 %x, i64 %n) { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] - %addr = phi i32 * [ %a, %entry ], [ %addr.next, %for.inc ] - %tmp1 = load i32, i32* %addr, align 4 - %tmp2 = add nsw i32 %tmp1, %x - br i1 %c, label %if.then, label %for.inc - -if.then: - store i32 %tmp2, i32* %addr, align 4 - br label %for.inc - -for.inc: - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - %addr.next = getelementptr inbounds i32, i32* %addr, i64 1 - br i1 %cond, label %for.body, label %for.end - -for.end: - ret void -} - ; CHECK-LABEL: predicated_udiv_scalarized_operand ; ; This test checks that we correctly compute the cost of the predicated udiv