From 41bd9ead35f60823c59367efe4f3d5ade87e756d Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 20 Feb 2020 10:40:30 -0800
Subject: [PATCH] [SystemZ]  Return scalarized costs for vector instructions on
 older archs.

A cost query for a vector instruction should return a cost even without
target vector support, and not trigger an assert.

VectorCombine does this with an input containing source code vectors.

Review: Ulrich Weigand
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 261 +++++++++---------
 .../CostModel/SystemZ/oldarch-vectors.ll      |  13 +
 2 files changed, 142 insertions(+), 132 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/SystemZ/oldarch-vectors.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index acec3c533585..df5286bef817 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -391,9 +391,57 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     }
   }
 
-  if (Ty->isVectorTy()) {
-    assert(ST->hasVector() &&
-           "getArithmeticInstrCost() called with vector type.");
+  if (!Ty->isVectorTy()) {
+    // These FP operations are supported with a dedicated instruction for
+    // float, double and fp128 (base implementation assumes float generally
+    // costs 2).
+    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+      return 1;
+
+    // There is no native support for FRem.
+    if (Opcode == Instruction::FRem)
+      return LIBCALL_COST;
+
+    // Give discount for some combined logical operations if supported.
+    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+      if (Opcode == Instruction::Xor) {
+        for (const Value *A : Args) {
+          if (const Instruction *I = dyn_cast<Instruction>(A))
+            if (I->hasOneUse() &&
+                (I->getOpcode() == Instruction::And ||
+                 I->getOpcode() == Instruction::Or ||
+                 I->getOpcode() == Instruction::Xor))
+              return 0;
+        }
+      }
+      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+        for (const Value *A : Args) {
+          if (const Instruction *I = dyn_cast<Instruction>(A))
+            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+              return 0;
+        }
+      }
+    }
+
+    // Or requires one instruction, although it has custom handling for i64.
+    if (Opcode == Instruction::Or)
+      return 1;
+
+    if (Opcode == Instruction::Xor && ScalarBits == 1) {
+      if (ST->hasLoadStoreOnCond2())
+        return 5; // 2 * (li 0; loc 1); xor
+      return 7; // 2 * ipm sequences ; xor ; shift ; compare
+    }
+
+    if (DivRemConstPow2)
+      return (SignedDivRem ? SDivPow2Cost : 1);
+    if (DivRemConst)
+      return DivMulSeqCost;
+    if (SignedDivRem || UnsignedDivRem)
+      return DivInstrCost;
+  }
+  else if (ST->hasVector()) {
     unsigned VF = Ty->getVectorNumElements();
     unsigned NumVectors = getNumVectorRegs(Ty);
 
@@ -454,56 +502,6 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return Cost;
     }
   }
-  else {  // Scalar:
-    // These FP operations are supported with a dedicated instruction for
-    // float, double and fp128 (base implementation assumes float generally
-    // costs 2).
-    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
-        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
-      return 1;
-
-    // There is no native support for FRem.
-    if (Opcode == Instruction::FRem)
-      return LIBCALL_COST;
-
-    // Give discount for some combined logical operations if supported.
-    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
-      if (Opcode == Instruction::Xor) {
-        for (const Value *A : Args) {
-          if (const Instruction *I = dyn_cast<Instruction>(A))
-            if (I->hasOneUse() &&
-                (I->getOpcode() == Instruction::And ||
-                 I->getOpcode() == Instruction::Or ||
-                 I->getOpcode() == Instruction::Xor))
-              return 0;
-        }
-      }
-      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
-        for (const Value *A : Args) {
-          if (const Instruction *I = dyn_cast<Instruction>(A))
-            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
-              return 0;
-        }
-      }
-    }
-
-    // Or requires one instruction, although it has custom handling for i64.
-    if (Opcode == Instruction::Or)
-      return 1;
-
-    if (Opcode == Instruction::Xor && ScalarBits == 1) {
-      if (ST->hasLoadStoreOnCond2())
-        return 5; // 2 * (li 0; loc 1); xor
-      return 7; // 2 * ipm sequences ; xor ; shift ; compare
-    }
-
-    if (DivRemConstPow2)
-      return (SignedDivRem ? SDivPow2Cost : 1);
-    if (DivRemConst)
-      return DivMulSeqCost;
-    if (SignedDivRem || UnsignedDivRem)
-      return DivInstrCost;
-  }
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
@@ -513,35 +511,36 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
   assert (Tp->isVectorTy());
-  assert (ST->hasVector() && "getShuffleCost() called.");
-  unsigned NumVectors = getNumVectorRegs(Tp);
+  if (ST->hasVector()) {
+    unsigned NumVectors = getNumVectorRegs(Tp);
 
-  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
 
-  // FP128 values are always in scalar registers, so there is no work
-  // involved with a shuffle, except for broadcast. In that case register
-  // moves are done with a single instruction per element.
-  if (Tp->getScalarType()->isFP128Ty())
-    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+    // FP128 values are always in scalar registers, so there is no work
+    // involved with a shuffle, except for broadcast. In that case register
+    // moves are done with a single instruction per element.
+    if (Tp->getScalarType()->isFP128Ty())
+      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
 
-  switch (Kind) {
-  case  TargetTransformInfo::SK_ExtractSubvector:
-    // ExtractSubvector Index indicates start offset.
+    switch (Kind) {
+    case  TargetTransformInfo::SK_ExtractSubvector:
+      // ExtractSubvector Index indicates start offset.
 
-    // Extracting a subvector from first index is a noop.
-    return (Index == 0 ? 0 : NumVectors);
+      // Extracting a subvector from first index is a noop.
+      return (Index == 0 ? 0 : NumVectors);
 
-  case TargetTransformInfo::SK_Broadcast:
-    // Loop vectorizer calls here to figure out the extra cost of
-    // broadcasting a loaded value to all elements of a vector. Since vlrep
-    // loads and replicates with a single instruction, adjust the returned
-    // value.
-    return NumVectors - 1;
+    case TargetTransformInfo::SK_Broadcast:
+      // Loop vectorizer calls here to figure out the extra cost of
+      // broadcasting a loaded value to all elements of a vector. Since vlrep
+      // loads and replicates with a single instruction, adjust the returned
+      // value.
+      return NumVectors - 1;
 
-  default:
+    default:
 
-    // SystemZ supports single instruction permutation / replication.
-    return NumVectors;
+      // SystemZ supports single instruction permutation / replication.
+      return NumVectors;
+    }
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -672,8 +671,36 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
   unsigned SrcScalarBits = Src->getScalarSizeInBits();
 
-  if (Src->isVectorTy()) {
-    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+  if (!Src->isVectorTy()) {
+    assert (!Dst->isVectorTy());
+
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (SrcScalarBits >= 32 ||
+          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+        return 1;
+      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+    }
+
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        Src->isIntegerTy(1)) {
+      if (ST->hasLoadStoreOnCond2())
+        return 2; // li 0; loc 1
+
+      // This should be extension of a compare i1 result, which is done with
+      // ipm and a varying sequence of instructions.
+      unsigned Cost = 0;
+      if (Opcode == Instruction::SExt)
+        Cost = (DstScalarBits < 64 ? 3 : 4);
+      if (Opcode == Instruction::ZExt)
+        Cost = 3;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+        // If operands of an fp-type was compared, this costs +1.
+        Cost++;
+      return Cost;
+    }
+  }
+  else if (ST->hasVector()) {
     assert (Dst->isVectorTy());
     unsigned VF = Src->getVectorNumElements();
     unsigned NumDstVectors = getNumVectorRegs(Dst);
@@ -759,35 +786,6 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       return VF + getScalarizationOverhead(Src, false, true);
     }
   }
-  else { // Scalar
-    assert (!Dst->isVectorTy());
-
-    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
-      if (SrcScalarBits >= 32 ||
-          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
-        return 1;
-      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
-    }
-
-    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-        Src->isIntegerTy(1)) {
-      if (ST->hasLoadStoreOnCond2())
-        return 2; // li 0; loc 1
-
-      // This should be extension of a compare i1 result, which is done with
-      // ipm and a varying sequence of instructions.
-      unsigned Cost = 0;
-      if (Opcode == Instruction::SExt)
-        Cost = (DstScalarBits < 64 ? 3 : 4);
-      if (Opcode == Instruction::ZExt)
-        Cost = 3;
-      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
-      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
-        // If operands of an fp-type was compared, this costs +1.
-        Cost++;
-      return Cost;
-    }
-  }
 
   return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
@@ -806,8 +804,31 @@ static unsigned getOperandsExtensionCost(const Instruction *I) {
 
 int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy, const Instruction *I) {
-  if (ValTy->isVectorTy()) {
-    assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+  if (!ValTy->isVectorTy()) {
+    switch (Opcode) {
+    case Instruction::ICmp: {
+      // A loaded value compared with 0 with multiple users becomes Load and
+      // Test. The load is then not foldable, so return 0 cost for the ICmp.
+      unsigned ScalarBits = ValTy->getScalarSizeInBits();
+      if (I != nullptr && ScalarBits >= 32)
+        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+                C->getZExtValue() == 0)
+              return 0;
+
+      unsigned Cost = 1;
+      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
+      return Cost;
+    }
+    case Instruction::Select:
+      if (ValTy->isFloatingPointTy())
+        return 4; // No load on condition for FP - costs a conditional jump.
+      return 1; // Load On Condition / Select Register.
+    }
+  }
+  else if (ST->hasVector()) {
     unsigned VF = ValTy->getVectorNumElements();
 
     // Called with a compare instruction.
@@ -856,30 +877,6 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
     }
   }
-  else { // Scalar
-    switch (Opcode) {
-    case Instruction::ICmp: {
-      // A loaded value compared with 0 with multiple users becomes Load and
-      // Test. The load is then not foldable, so return 0 cost for the ICmp.
-      unsigned ScalarBits = ValTy->getScalarSizeInBits();
-      if (I != nullptr && ScalarBits >= 32)
-        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
-          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
-            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
-                C->getZExtValue() == 0)
-              return 0;
-
-      unsigned Cost = 1;
-      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
-        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
-      return Cost;
-    }
-    case Instruction::Select:
-      if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP - costs a conditional jump.
-      return 1; // Load On Condition / Select Register.
-    }
-  }
 
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
 }
diff --git a/llvm/test/Analysis/CostModel/SystemZ/oldarch-vectors.ll b/llvm/test/Analysis/CostModel/SystemZ/oldarch-vectors.ll
new file mode 100644
index 000000000000..b56b4b9bdfc8
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/oldarch-vectors.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z10
+;
+; Check that some costs can be returned for vector instructions also without
+; vector support.
+
+define void @fun(<2 x double>* %arg) {
+entry:
+   %add = fadd <2 x double> undef, undef
+   shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+   %conv = fptoui <4 x float> undef to <4 x i32>
+   %cmp = icmp eq <2 x i64> undef, undef
+  ret void
+}