forked from OSchip/llvm-project
[CostModel][X86] Add support for broadcast shuffle costs
Currently this only covers broadcasts whose input and output vectors have the same width. Differential Revision: https://reviews.llvm.org/D27811 (llvm-svn: 291122)
This commit is contained in:
parent
406acdba61
commit
bca02f9e20
|
@ -605,7 +605,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
|
||||
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
|
||||
|
||||
if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
|
||||
if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate ||
|
||||
Kind == TTI::SK_Broadcast) {
|
||||
// For broadcasts we splat the first element of the first input register,
|
||||
// so we only need to reference that one input, and all of the output
|
||||
// registers are identical.
|
||||
if (Kind == TTI::SK_Broadcast)
|
||||
LT.first = 1;
|
||||
|
||||
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
|
||||
{ TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb
|
||||
|
@ -617,10 +624,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX512BWShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
|
||||
// + 2*pshufb + vinserti64x4
|
||||
{ TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
|
||||
{ TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
|
||||
{ TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
|
||||
// + 2*pshufb + vinserti64x4
|
||||
};
|
||||
|
||||
if (ST->hasBWI())
|
||||
|
@ -629,10 +639,15 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX512ShuffleTbl[] = {
|
||||
{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
|
||||
{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
|
||||
{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
|
||||
{ TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
|
||||
{ TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastsd
|
||||
{ TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastss
|
||||
{ TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
|
||||
{ TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
|
||||
{ TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
|
||||
{ TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
|
||||
{ TTI::SK_Reverse, MVT::v16i32, 1 } // vpermd
|
||||
};
|
||||
|
||||
if (ST->hasAVX512())
|
||||
|
@ -641,6 +656,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX2ShuffleTbl[] = {
|
||||
{ TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastsd
|
||||
{ TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastss
|
||||
{ TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
|
||||
{ TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
|
||||
{ TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
|
||||
{ TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
|
||||
{ TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
|
||||
{ TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
|
||||
|
@ -657,6 +679,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry AVX1ShuffleTbl[] = {
|
||||
{ TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
|
||||
{ TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
|
||||
{ TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
|
||||
{ TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
|
||||
{ TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
|
||||
{ TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
|
||||
|
@ -692,6 +721,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSSE3ShuffleTbl[] = {
|
||||
{ TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
|
||||
{ TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
|
||||
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
|
||||
|
||||
|
@ -704,6 +736,12 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE2ShuffleTbl[] = {
|
||||
{ TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
|
||||
{ TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
|
||||
{ TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
|
||||
{ TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
|
||||
{ TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
|
||||
|
||||
{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
|
||||
{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
|
||||
{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
|
||||
|
@ -723,6 +761,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
return LT.first * Entry->Cost;
|
||||
|
||||
static const CostTblEntry SSE1ShuffleTbl[] = {
|
||||
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
|
||||
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
|
||||
{ TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
|
||||
};
|
||||
|
|
|
@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
|
|||
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; Broadcast shuffle cost checks for i64 element vectors.
; Each shufflevector below uses an all-zero mask (zeroinitializer) with an
; undef second operand, so every result lane is element 0 of the source.
; Sources of 128, 256 and 512 bits are covered, one shuffle per width.
; CHECK-LABEL: 'test_vXi64'
|
||||
define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
|
||||
; SSE: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
|
||||
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
|
||||
|
||||
; NOTE(review) the 512-bit expectations below equal the 256-bit ones on the
; SSE and AVX prefixes; presumably the cost model does not scale broadcast
; cost by the number of legalized registers -- confirm against the C++ side.
; SSE: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; Broadcast shuffle cost checks for float element vectors.
; Each shufflevector below uses an all-zero mask (zeroinitializer) with an
; undef second operand, so every result lane is element 0 of the source.
; Sources of 64, 128, 256 and 512 bits are covered, one shuffle per width.
; CHECK-LABEL: 'test_vXf32'
|
||||
define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
|
||||
; SSE: cost of 1 {{.*}} %V64 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V64 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V64 = shufflevector
|
||||
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
|
||||
%V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
|
||||
|
||||
; NOTE(review) the 512-bit expectations below equal the 256-bit ones on the
; SSE and AVX prefixes; presumably the cost model does not scale broadcast
; cost by the number of legalized registers -- confirm against the C++ side.
; SSE: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; Broadcast shuffle cost checks for i32 element vectors.
; Each shufflevector below uses an all-zero mask (zeroinitializer) with an
; undef second operand, so every result lane is element 0 of the source.
; Sources of 64, 128, 256 and 512 bits are covered, one shuffle per width.
; CHECK-LABEL: 'test_vXi32'
|
||||
define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
|
||||
; SSE: cost of 1 {{.*}} %V64 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V64 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V64 = shufflevector
|
||||
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
|
||||
%V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
|
||||
; SSE: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
|
||||
|
||||
; NOTE(review) the 512-bit expectations below equal the 256-bit ones on the
; SSE and AVX prefixes; presumably the cost model does not scale broadcast
; cost by the number of legalized registers -- confirm against the C++ side.
; SSE: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; Broadcast shuffle cost checks for i16 element vectors.
; Each shufflevector below uses an all-zero mask (zeroinitializer) with an
; undef second operand, so every result lane is element 0 of the source.
; Sources of 128, 256 and 512 bits are covered, one shuffle per width.
; Unlike the wider-element tests, the SSE expectations here are split per
; SSE level (SSE2 / SSSE3 / SSE42 prefixes) because the expected costs differ.
; CHECK-LABEL: 'test_vXi16'
|
||||
define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
|
||||
; SSE2: cost of 2 {{.*}} %V128 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
|
||||
%V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
|
||||
|
||||
; SSE2: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 3 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
|
||||
|
||||
; The 512-bit case checks the AVX512F and AVX512BW prefixes separately
; instead of the shared AVX512 prefix used for the narrower widths.
; SSE2: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 3 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; Broadcast shuffle cost checks for i8 element vectors.
; Each shufflevector below uses an all-zero mask (zeroinitializer) with an
; undef second operand, so every result lane is element 0 of the source.
; Sources of 128, 256 and 512 bits are covered, one shuffle per width.
; Unlike the wider-element tests, the SSE expectations here are split per
; SSE level (SSE2 / SSSE3 / SSE42 prefixes) because the expected costs differ.
; CHECK-LABEL: 'test_vXi8'
|
||||
define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
|
||||
; SSE2: cost of 3 {{.*}} %V128 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX: cost of 1 {{.*}} %V128 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
|
||||
%V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
|
||||
|
||||
; SSE2: cost of 3 {{.*}} %V256 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V256 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
|
||||
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
|
||||
%V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
|
||||
|
||||
; The 512-bit case checks the AVX512F and AVX512BW prefixes separately
; instead of the shared AVX512 prefix used for the narrower widths.
; SSE2: cost of 3 {{.*}} %V512 = shufflevector
|
||||
; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; SSE42: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX1: cost of 2 {{.*}} %V512 = shufflevector
|
||||
; AVX2: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
|
||||
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
|
||||
%V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
|
||||
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue