forked from OSchip/llvm-project
X86 CostModel: Add support for some of the common arithmetic instructions for SSE4, AVX and AVX2.
llvm-svn: 167347
This commit is contained in:
parent
c59571878c
commit
c2345cbe73
|
@ -51,7 +51,7 @@ public:
|
|||
};
|
||||
|
||||
class VectorTargetTransformImpl : public VectorTargetTransformInfo {
|
||||
private:
|
||||
protected:
|
||||
const TargetLowering *TLI;
|
||||
|
||||
/// Estimate the cost of type-legalization and the legalized type.
|
||||
|
|
|
@ -17504,3 +17504,73 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
|
|||
|
||||
return Res;
|
||||
}
|
||||
|
||||
unsigned
|
||||
X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
|
||||
Type *Ty) const {
|
||||
const X86Subtarget &ST =
|
||||
TLI->getTargetMachine().getSubtarget<X86Subtarget>();
|
||||
|
||||
// Fix some of the inaccuracies of the target independent estimation.
|
||||
if (Ty->isVectorTy() && ST.hasSSE41()) {
|
||||
unsigned NumElem = Ty->getVectorNumElements();
|
||||
unsigned SizeInBits = Ty->getScalarType()->getScalarSizeInBits();
|
||||
|
||||
bool Is2 = (NumElem == 2);
|
||||
bool Is4 = (NumElem == 4);
|
||||
bool Is8 = (NumElem == 8);
|
||||
bool Is32bits = (SizeInBits == 32);
|
||||
bool Is64bits = (SizeInBits == 64);
|
||||
bool HasAvx = ST.hasAVX();
|
||||
bool HasAvx2 = ST.hasAVX2();
|
||||
|
||||
switch (Opcode) {
|
||||
case Instruction::Add:
|
||||
case Instruction::Sub:
|
||||
case Instruction::Mul: {
|
||||
// Only AVX2 has support for 8-wide integer operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx2))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx2))) return 1;
|
||||
|
||||
// We don't have to completly scalarize unsupported ops. We can
|
||||
// issue two half-sized operations (with some overhead).
|
||||
// We don't need to extract the lower part of the YMM to the XMM.
|
||||
// Extract the upper, two ops, insert the upper = 4.
|
||||
if (Is32bits && Is8 && HasAvx) return 4;
|
||||
if (Is64bits && Is4 && HasAvx) return 4;
|
||||
break;
|
||||
}
|
||||
case Instruction::FAdd:
|
||||
case Instruction::FSub:
|
||||
case Instruction::FMul: {
|
||||
// AVX has support for 8-wide float operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
|
||||
break;
|
||||
}
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
case Instruction::AShr:
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor: {
|
||||
// AVX has support for 8-wide integer bitwise operations.
|
||||
if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
|
||||
if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
|
||||
}
|
||||
|
||||
unsigned
|
||||
X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) const {
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
|
||||
return 0;
|
||||
return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
|
||||
}
|
||||
|
||||
|
|
|
@ -953,13 +953,10 @@ namespace llvm {
|
|||
explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
|
||||
VectorTargetTransformImpl(TL) {}
|
||||
|
||||
virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
|
||||
|
||||
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) const {
|
||||
// Floating point scalars are already located in index #0.
|
||||
if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
|
||||
return 0;
|
||||
return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
|
||||
}
|
||||
unsigned Index) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.8.0"
|
||||
|
||||
; Integer vector add on corei7-avx (AVX1, no AVX2): the 128-bit forms
; (<4 x i32>, <2 x i64>) are single instructions (cost 1); the 256-bit
; forms must be split into two half-width ops plus an extract and an
; insert of the upper lane (cost 4), since AVX1 lacks 256-bit integer add.
define i32 @add(i32 %arg) {
;CHECK: cost of 1 {{.*}} add
%A = add <4 x i32> undef, undef
;CHECK: cost of 4 {{.*}} add
%B = add <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} add
%C = add <2 x i64> undef, undef
;CHECK: cost of 4 {{.*}} add
%D = add <4 x i64> undef, undef
;CHECK: cost of 1 {{.*}} ret
ret i32 undef
}
|
||||
|
||||
|
||||
; Vector bitwise xor: AVX1 does provide full 256-bit bitwise operations,
; so both the 128-bit and 256-bit forms cost a single instruction.
define i32 @xor(i32 %arg) {
;CHECK: cost of 1 {{.*}} xor
%A = xor <4 x i32> undef, undef
;CHECK: cost of 1 {{.*}} xor
%B = xor <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} xor
%C = xor <2 x i64> undef, undef
;CHECK: cost of 1 {{.*}} xor
%D = xor <4 x i64> undef, undef
;CHECK: cost of 1 {{.*}} ret
ret i32 undef
}
|
||||
|
||||
|
||||
; Vector float multiply: AVX1 has native 8-wide (256-bit) FP arithmetic,
; so both widths are a single instruction (cost 1 each).
define i32 @fmul(i32 %arg) {
;CHECK: cost of 1 {{.*}} fmul
%A = fmul <4 x float> undef, undef
;CHECK: cost of 1 {{.*}} fmul
%B = fmul <8 x float> undef, undef
ret i32 undef
}
|
|
@ -30,10 +30,12 @@ vector.body: ; preds = %for.body.lr.ph, %ve
|
|||
%5 = bitcast i32* %4 to <8 x i32>*
|
||||
;CHECK: cost of 1 {{.*}} load
|
||||
%6 = load <8 x i32>* %5, align 4
|
||||
;CHECK: cost of 4 {{.*}} mul
|
||||
%7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
||||
%8 = getelementptr inbounds i32* %A, i64 %index
|
||||
%9 = bitcast i32* %8 to <8 x i32>*
|
||||
%10 = load <8 x i32>* %9, align 4
|
||||
;CHECK: cost of 4 {{.*}} add
|
||||
%11 = add nsw <8 x i32> %10, %7
|
||||
;CHECK: cost of 1 {{.*}} store
|
||||
store <8 x i32> %11, <8 x i32>* %9, align 4
|
||||
|
|
Loading…
Reference in New Issue