From 6968520c3b04afa7cf38ed8db24c4f15970580d5 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Tue, 27 Apr 2021 15:46:03 +0100
Subject: [PATCH] Revert "[LoopVectorize] Simplify scalar cost calculation in
 getInstructionCost"

This reverts commit 4afeda9157cffd2daa83f8075d73f1e11ea34c81.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 64 ++++++++-----------
 .../AArch64/no_vector_instructions.ll         |  2 +-
 .../AArch64/predication_costs.ll              | 35 ----------
 3 files changed, 27 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ecbb0befcf71..17e9947c1c63 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7316,37 +7316,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
-                                                ElementCount VF) -> bool {
-    if (VF.isScalar())
-      return true;
-
-    auto Scalarized = InstsToScalarize.find(VF);
-    assert(Scalarized != InstsToScalarize.end() &&
-           "VF not yet analyzed for scalarization profitability");
-    return !Scalarized->second.count(I) &&
-           llvm::all_of(I->users(), [&](User *U) {
-             auto *UI = cast<Instruction>(U);
-             return !Scalarized->second.count(UI);
-           });
-  };
-
-  if (isScalarAfterVectorization(I, VF)) {
-    // With the exception of GEPs and PHIs, after scalarization there should
-    // only be one copy of the instruction generated in the loop. This is
-    // because the VF is either 1, or any instructions that need scalarizing
-    // have already been dealt with by the the time we get here. As a result,
-    // it means we don't have to multiply the instruction cost by VF.
-    assert(I->getOpcode() == Instruction::GetElementPtr ||
-           I->getOpcode() == Instruction::PHI ||
-           hasSingleCopyAfterVectorization(I, VF));
-    VectorTy = RetTy;
-  } else
-    VectorTy = ToVectorTy(RetTy, VF);
-
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
@@ -7474,16 +7447,21 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       Op2VK = TargetTransformInfo::OK_UniformValue;
 
     SmallVector<const Value *, 4> Operands(I->operand_values());
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(
+                   I->getOpcode(), VectorTy, CostKind,
+                   TargetTransformInfo::OK_AnyValue,
+                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
   }
   case Instruction::FNeg: {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    return TTI.getArithmeticInstrCost(
-        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
-        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
-        TargetTransformInfo::OP_None, I->getOperand(0), I);
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
+    return N * TTI.getArithmeticInstrCost(
+                   I->getOpcode(), VectorTy, CostKind,
+                   TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+                   I->getOperand(0), I);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -7627,7 +7605,14 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       }
     }
 
-    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
+    unsigned N;
+    if (isScalarAfterVectorization(I, VF)) {
+      assert(!VF.isScalable() && "VF is assumed to be non scalable");
+      N = VF.getKnownMinValue();
+    } else
+      N = 1;
+    return N *
+           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -7642,8 +7627,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   case Instruction::ExtractValue:
     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
   default:
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+    // The cost of executing VF copies of the scalar instruction. This opcode
+    // is unknown. Assume that it is the same as 'mul'.
+    return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
+                                       Instruction::Mul, VectorTy, CostKind) +
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 3061998518ad..247ea35ff5d0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(i64* %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 858b28ddd321..b0ebb4edf2ad 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -86,41 +86,6 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: predicated_store_phi
-;
-; Same as predicate_store except we use a pointer PHI to maintain the address
-;
-; CHECK: Found new scalar instruction:   %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found new scalar instruction:   %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
-; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
-;
-define void @predicated_store_phi(i32* %a, i1 %c, i32 %x, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
-  %addr = phi i32 * [ %a, %entry ], [ %addr.next, %for.inc ]
-  %tmp1 = load i32, i32* %addr, align 4
-  %tmp2 = add nsw i32 %tmp1, %x
-  br i1 %c, label %if.then, label %for.inc
-
-if.then:
-  store i32 %tmp2, i32* %addr, align 4
-  br label %for.inc
-
-for.inc:
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n
-  %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
 ; CHECK-LABEL: predicated_udiv_scalarized_operand
 ;
 ; This test checks that we correctly compute the cost of the predicated udiv