[PowerPC] Adjust load/store costs in PPCTTI
This provides more realistic costs for the insert/extractelement instructions
(which are load/store pairs), accounts for the cheap unaligned Altivec load
sequence, and for unaligned VSX load/stores.

Bad news:
MultiSource/Applications/sgefa/sgefa - 35% slowdown (this will require more
investigation)
SingleSource/Benchmarks/McGill/queens - 20% slowdown (we no longer vectorize
this, but it was a constant store that was scalarized)
MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2 - 2% slowdown

Good news:
SingleSource/Benchmarks/Shootout/ary3 - 54% speedup
SingleSource/Benchmarks/Shootout-C++/ary - 40% speedup
MultiSource/Benchmarks/Ptrdist/ks/ks - 35% speedup
MultiSource/Benchmarks/FreeBench/neural/neural - 30% speedup
MultiSource/Benchmarks/TSVC/Symbolics-flt/Symbolics-flt - 20% speedup

Unfortunately, estimating the costs of the stack-based scalarization sequences
is hard, and adjusting these costs is like a game of whac-a-mole :( I'll
revisit this again after we have better codegen for vector extloads and
truncstores and unaligned load/stores.

llvm-svn: 205658
commit de0b413ec0 (parent b1308d525c)
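The effect is easiest to see through the cost-model analysis. Here is a minimal
sketch in the style of the tests below (not part of this commit; the RUN line
and CPU choice are assumptions):

    ; RUN: opt < %s -cost-model -analyze -mcpu=g5 | FileCheck %s
    target triple = "powerpc64-unknown-linux-gnu"

    define <4 x i32> @cheap_unaligned_load(<4 x i32>* %p) {
    ; After this change, an unaligned 128-bit Altivec load is costed like an
    ; aligned one, since the realigning load sequence is cheap.
    ; CHECK: cost of 1 {{.*}} load
      %v = load <4 x i32>* %p, align 4
      ret <4 x i32> %v
    }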
lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -216,7 +216,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark. It may need to be
   // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
 
   // Vector element insert/extract with Altivec is very expensive,
   // because they require store and reload with the attendant
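With these values, an extract is costed at 2 + 1 = 3 and an insert at
2 + 7 + 1 = 10, matching the updated insert_extract.ll expectations below. A
minimal standalone C++ sketch of that arithmetic (the base cost of 1 is an
inference from the tests, and ppcVectorInstrCost is a hypothetical name, not
this patch's API):

    #include <cassert>

    enum class VecOp { Insert, Extract };

    // Sketch of the patched penalty logic: every Altivec element access pays
    // a small stack round-trip penalty; inserts pay an extra 7 because the
    // whole vector is stored, patched in memory, and reloaded.
    unsigned ppcVectorInstrCost(VecOp Op, unsigned BaseCost = 1) {
      unsigned LHSPenalty = 2;          // base penalty, as in the patch
      if (Op == VecOp::Insert)
        LHSPenalty += 7;                // extra store/reload for inserts
      return LHSPenalty + BaseCost;     // BaseCost = 1 is an assumption
    }

    int main() {
      assert(ppcVectorInstrCost(VecOp::Extract) == 3);   // insert_extract.ll
      assert(ppcVectorInstrCost(VecOp::Insert) == 10);   // insert_extract.ll
      return 0;
    }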
@@ -240,14 +242,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   unsigned Cost =
     TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // FIXME: Update this for VSX loads/stores that support unaligned access.
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
 
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
   unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
     Cost += LT.first*(SrcBytes/Alignment-1);
 
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
 
   return Cost;
 }
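To make the new arithmetic concrete: a misaligned access now pays one extra
operation per alignment-sized chunk beyond the first, and a misaligned vector
store additionally pays one extractelement per lane. A hedged C++ sketch
(assuming LT.first == 1, a base cost of 1, and the extract cost of 3 derived
above; ppcMemoryOpCost is a hypothetical name, and the Altivec/VSX bypasses
are omitted):

    #include <cassert>

    // One extra op per alignment-sized chunk beyond the first; stores also
    // pay per-lane extractelement scalarization. Loads are cheaper because
    // they are expanded as vector-load + permutation.
    unsigned ppcMemoryOpCost(unsigned SrcBytes, unsigned Alignment,
                             bool IsStore, unsigned NumElts,
                             unsigned ExtractCost = 3, unsigned BaseCost = 1) {
      unsigned Cost = BaseCost;
      if (SrcBytes && Alignment && Alignment < SrcBytes) {
        Cost += SrcBytes / Alignment - 1;   // chunk decomposition
        if (IsStore)
          Cost += NumElts * ExtractCost;    // scalarization, stores only
      }
      return Cost;
    }

    int main() {
      // A <4 x i32> store at align 4: 1 + (16/4 - 1) + 4*3 = 16.
      assert(ppcMemoryOpCost(16, 4, /*IsStore=*/true, 4) == 16);
      // The matching load: 1 + 3 = 4 (and with the UnalignedAltivec bypass
      // in the real code, a 128-bit Altivec load keeps the base cost of 1).
      assert(ppcMemoryOpCost(16, 4, /*IsStore=*/false, 4) == 4);
      return 0;
    }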
test/Analysis/CostModel/PowerPC/ext.ll
@@ -13,7 +13,7 @@ define void @exts() {
   ; CHECK: cost of 1 {{.*}} sext
   %v3 = sext <4 x i16> undef to <4 x i32>
 
-  ; CHECK: cost of 216 {{.*}} sext
+  ; CHECK: cost of 112 {{.*}} sext
   %v4 = sext <8 x i16> undef to <8 x i32>
 
   ret void
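The drop from 216 to 112 is consistent with the cheaper element accesses: the
<8 x i16> to <8 x i32> sext is scalarized, so each of the eight lanes pays one
extractelement, one insertelement, and a unit scalar conversion, i.e.
8 × (13 + 13 + 1) = 216 before and 8 × (3 + 10 + 1) = 112 after. (The per-lane
decomposition is an inference from the numbers, not something this patch
states.)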
test/Analysis/CostModel/PowerPC/insert_extract.ll
@@ -3,13 +3,13 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @insert(i32 %arg) {
-  ; CHECK: cost of 13 {{.*}} insertelement
+  ; CHECK: cost of 10 {{.*}} insertelement
   %x = insertelement <4 x i32> undef, i32 %arg, i32 0
   ret i32 undef
 }
 
 define i32 @extract(<4 x i32> %arg) {
-  ; CHECK: cost of 13 {{.*}} extractelement
+  ; CHECK: cost of 3 {{.*}} extractelement
   %x = extractelement <4 x i32> %arg, i32 0
   ret i32 %x
 }
test/Analysis/CostModel/PowerPC/load_store.ll
@@ -31,9 +31,12 @@ define i32 @loads(i32 %arg) {
 
   ; FIXME: There actually are sub-vector Altivec loads, and so we could handle
   ; this with a small expense, but we don't currently.
-  ; CHECK: cost of 60 {{.*}} load
+  ; CHECK: cost of 48 {{.*}} load
   load <4 x i16>* undef, align 2
 
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i32>* undef, align 4
+
   ret i32 undef
 }
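The 60 → 48 change likewise tracks the insertelement cost: rebuilding the
scalarized <4 x i16> load takes one insert per lane, and each insert became 3
cheaper, so 60 − 4 × 3 = 48 (again an inference from the numbers, not stated
in the patch). The new <4 x i32> check exercises the UnalignedAltivec path
added above: a 128-bit Altivec load at align 4 now skips the misalignment
penalty entirely and keeps the base cost of 1.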