forked from OSchip/llvm-project
X86 CostModel: Add support for some of the common arithmetic instructions for SSE4, AVX and AVX2.
llvm-svn: 167347
This commit is contained in:
parent
c59571878c
commit
c2345cbe73
|
@ -51,7 +51,7 @@ public:
|
|||
};
|
||||
|
||||
class VectorTargetTransformImpl : public VectorTargetTransformInfo {
|
||||
private:
|
||||
protected:
|
||||
const TargetLowering *TLI;
|
||||
|
||||
/// Estimate the cost of type-legalization and the legalized type.
|
||||
|
|
|
@ -17504,3 +17504,73 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
|
|||
|
||||
return Res;
|
||||
}
|
||||
|
||||
unsigned
|
||||
X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
|
||||
Type *Ty) const {
|
||||
const X86Subtarget &ST =
|
||||
TLI->getTargetMachine().getSubtarget<X86Subtarget>();
|
||||
|
||||
// Fix some of the inaccuracies of the target independent estimation.
|
||||
if (Ty->isVectorTy() && ST.hasSSE41()) {
|
||||
unsigned NumElem = Ty->getVectorNumElements();
|
||||
unsigned SizeInBits = Ty->getScalarType()->getScalarSizeInBits();
|
||||
|
||||
bool Is2 = (NumElem == 2);
|
||||
bool Is4 = (NumElem == 4);
|
||||
bool Is8 = (NumElem == 8);
|
||||
bool Is32bits = (SizeInBits == 32);
|
||||
bool Is64bits = (SizeInBits == 64);
|
||||
bool HasAvx = ST.hasAVX();
|
||||
bool HasAvx2 = ST.hasAVX2();
|
||||
|
||||
switch (Opcode) {
|
||||
case Instruction::Add:
|
||||
case Instruction::Sub:
|
||||
case Instruction::Mul: {
|
||||
// Only AVX2 has support for 8-wide integer operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx2))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx2))) return 1;
|
||||
|
||||
// We don't have to completly scalarize unsupported ops. We can
|
||||
// issue two half-sized operations (with some overhead).
|
||||
// We don't need to extract the lower part of the YMM to the XMM.
|
||||
// Extract the upper, two ops, insert the upper = 4.
|
||||
if (Is32bits && Is8 && HasAvx) return 4;
|
||||
if (Is64bits && Is4 && HasAvx) return 4;
|
||||
break;
|
||||
}
|
||||
case Instruction::FAdd:
|
||||
case Instruction::FSub:
|
||||
case Instruction::FMul: {
|
||||
// AVX has support for 8-wide float operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
|
||||
break;
|
||||
}
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
case Instruction::AShr:
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor: {
|
||||
// AVX has support for 8-wide integer bitwise operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
|
||||
}
|
||||
|
||||
unsigned
|
||||
X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) const {
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
|
||||
return 0;
|
||||
return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
|
||||
}
|
||||
|
||||
|
|
|
@ -953,13 +953,10 @@ namespace llvm {
|
|||
explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
|
||||
VectorTargetTransformImpl(TL) {}
|
||||
|
||||
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
||||
|
||||
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) const {
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
|
||||
return 0;
|
||||
return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
|
||||
}
|
||||
unsigned Index) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
; Integer vector add on corei7-avx (AVX1, no AVX2): the 128-bit forms
; (<4 x i32>, <2 x i64>) are single instructions (cost 1); the 256-bit
; forms must be split into two half-width ops plus an extract and an
; insert of the upper lane (cost 4), since AVX1 lacks 256-bit integer add.
define i32 @add(i32 %arg) {
;CHECK: cost of 1 {{.*}} add
%A = add <4 x i32> undef, undef
;CHECK: cost of 4 {{.*}} add
%B = add <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} add
%C = add <2 x i64> undef, undef
;CHECK: cost of 4 {{.*}} add
%D = add <4 x i64> undef, undef
;CHECK: cost of 1 {{.*}} ret
ret i32 undef
}
|
||||
|
||||
|
||||
; Vector bitwise xor: AVX1 does provide full 256-bit bitwise operations,
; so both the 128-bit and 256-bit forms cost a single instruction.
define i32 @xor(i32 %arg) {
;CHECK: cost of 1 {{.*}} xor
%A = xor <4 x i32> undef, undef
;CHECK: cost of 1 {{.*}} xor
%B = xor <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} xor
%C = xor <2 x i64> undef, undef
;CHECK: cost of 1 {{.*}} xor
%D = xor <4 x i64> undef, undef
;CHECK: cost of 1 {{.*}} ret
ret i32 undef
}
|
||||
|
||||
|
||||
; Vector float multiply: AVX1 has native 8-wide (256-bit) FP arithmetic,
; so both widths are a single instruction (cost 1 each).
define i32 @fmul(i32 %arg) {
;CHECK: cost of 1 {{.*}} fmul
%A = fmul <4 x float> undef, undef
;CHECK: cost of 1 {{.*}} fmul
%B = fmul <8 x float> undef, undef
ret i32 undef
}
|
|
@ -30,10 +30,12 @@ vector.body: ; preds = %for.body.lr.ph, %ve
|
|||
%5 = bitcast i32* %4 to <8 x i32>*
|
||||
;CHECK: cost of 1 {{.*}} load
|
||||
%6 = load <8 x i32>* %5, align 4
|
||||
;CHECK: cost of 4 {{.*}} mul
|
||||
%7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
||||
%8 = getelementptr inbounds i32* %A, i64 %index
|
||||
%9 = bitcast i32* %8 to <8 x i32>*
|
||||
%10 = load <8 x i32>* %9, align 4
|
||||
;CHECK: cost of 4 {{.*}} add
|
||||
%11 = add nsw <8 x i32> %10, %7
|
||||
;CHECK: cost of 1 {{.*}} store
|
||||
store <8 x i32> %11, <8 x i32>* %9, align 4
|
||||
|
|
Loading…
Reference in New Issue