From 9a0542a792930c11a26bc9fa7e0eaa70dfd32d16 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein
Date: Fri, 10 Jun 2016 17:01:05 +0000
Subject: [PATCH] [X86] Add costs for SSE zext/sext to v4i64 to TTI

The costs are somewhat hand-wavy, but should be much closer to the truth
than what we get from BasicTTI.

Differential Revision: http://reviews.llvm.org/D21156

llvm-svn: 272406
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  3 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 14 ++++
 llvm/test/Analysis/CostModel/X86/sse-itoi.ll  | 79 +++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 669bcbc3c1be..78cb0af16996 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -412,6 +412,9 @@ public:
       // If we are converting vectors and the operation is illegal, or
       // if the vectors are legalized to different types, estimate the
       // scalarization costs.
+      // TODO: This is probably a big overestimate. For splits, we should have
+      // something like getTypeLegalizationCost() + 2 * getCastInstrCost().
+      // The same applies to getCmpSelInstrCost() and getArithmeticInstrCost()
       unsigned Num = Dst->getVectorNumElements();
       unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
           Opcode, Dst->getScalarType(), Src->getScalarType());
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 1baa49c3c08d..c86790a9326f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -709,6 +709,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   };
 
   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
+
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
@@ -759,6 +766,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
 
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
+
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
diff --git a/llvm/test/Analysis/CostModel/X86/sse-itoi.ll b/llvm/test/Analysis/CostModel/X86/sse-itoi.ll
index 13a95a81d42d..46d993564609 100644
--- a/llvm/test/Analysis/CostModel/X86/sse-itoi.ll
+++ b/llvm/test/Analysis/CostModel/X86/sse-itoi.ll
@@ -1,6 +1,85 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -cost-model -analyze < %s | FileCheck --check-prefix=SSE41 %s
 
+define void @zext_v4i8_to_v4i64(<4 x i8>* %a) {
+; SSE2: zext_v4i8_to_v4i64
+; SSE2: cost of 4 {{.*}} zext
+;
+; SSE41: zext_v4i8_to_v4i64
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = zext <4 x i8> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i8_to_v4i64(<4 x i8>* %a) {
+; SSE2: sext_v4i8_to_v4i64
+; SSE2: cost of 8 {{.*}} sext
+;
+; SSE41: sext_v4i8_to_v4i64
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <4 x i8>, <4 x i8>* %a
+  %2 = sext <4 x i8> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
+define void @zext_v4i16_to_v4i64(<4 x i16>* %a) {
+; SSE2: zext_v4i16_to_v4i64
+; SSE2: cost of 3 {{.*}} zext
+;
+; SSE41: zext_v4i16_to_v4i64
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = zext <4 x i16> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i16_to_v4i64(<4 x i16>* %a) {
+; SSE2: sext_v4i16_to_v4i64
+; SSE2: cost of 10 {{.*}} sext
+;
+; SSE41: sext_v4i16_to_v4i64
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <4 x i16>, <4 x i16>* %a
+  %2 = sext <4 x i16> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
+
+define void @zext_v4i32_to_v4i64(<4 x i32>* %a) {
+; SSE2: zext_v4i32_to_v4i64
+; SSE2: cost of 3 {{.*}} zext
+;
+; SSE41: zext_v4i32_to_v4i64
+; SSE41: cost of 2 {{.*}} zext
+;
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = zext <4 x i32> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
+define void @sext_v4i32_to_v4i64(<4 x i32>* %a) {
+; SSE2: sext_v4i32_to_v4i64
+; SSE2: cost of 5 {{.*}} sext
+;
+; SSE41: sext_v4i32_to_v4i64
+; SSE41: cost of 2 {{.*}} sext
+;
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = sext <4 x i32> %1 to <4 x i64>
+  store <4 x i64> %2, <4 x i64>* undef, align 4
+  ret void
+}
+
 define void @zext_v16i16_to_v16i32(<16 x i16>* %a) {
 ; SSE2: zext_v16i16_to_v16i32
 ; SSE2: cost of 6 {{.*}} zext
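
For a rough intuition behind the SSE4.1 cost of 2, a v4i32 to v4i64 extend can be lowered as one PMOVZXDQ/PMOVSXDQ per 128-bit half, with a shuffle feeding the upper half. The sketch below is not part of the patch; it is a minimal C++ illustration using SSE intrinsics, assuming SSE4.1 is available, and the helper name is made up for the example.

  #include <smmintrin.h> // SSE4.1 intrinsics (illustration only, not from the patch)

  // Hypothetical helper: zero-extend four i32 lanes into two __m128i values,
  // each holding two i64 lanes.
  static inline void zext_v4i32_to_v4i64(__m128i src, __m128i *lo, __m128i *hi) {
    *lo = _mm_cvtepu32_epi64(src);                          // PMOVZXDQ on lanes 0-1
    *hi = _mm_cvtepu32_epi64(_mm_unpackhi_epi64(src, src)); // PMOVZXDQ on lanes 2-3
  }

The sign-extending variant would use _mm_cvtepi32_epi64 (PMOVSXDQ) in the same shape, consistent with the table assigning the same cost of 2 to the sext entries; the extra shuffle is one reason the commit message calls these costs hand-wavy rather than exact instruction counts.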