forked from OSchip/llvm-project
Refine fix to bug 15041.
Thanks to help from Nadav and Hal, I have a more reasonable (and even correct!) approach. This specifically penalizes the insertelement and extractelement operations for the performance hit that will occur on PowerPC processors.
llvm-svn: 174725
This commit is contained in:
parent
22d275f7b8
commit
62fe7a5b17
|
@@ -194,24 +194,23 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|||
unsigned Index) const {
|
||||
assert(Val->isVectorTy() && "This must be a vector type");
|
||||
|
||||
const unsigned Awful = 1000;
|
||||
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
||||
assert(ISD && "Invalid opcode");
|
||||
|
||||
// Vector element insert/extract with Altivec is very expensive.
|
||||
// Until VSX is available, avoid vectorizing loops that require
|
||||
// these operations.
|
||||
if (Opcode == ISD::EXTRACT_VECTOR_ELT ||
|
||||
Opcode == ISD::INSERT_VECTOR_ELT)
|
||||
return Awful;
|
||||
// Estimated cost of a load-hit-store delay. This was obtained
|
||||
// experimentally as a minimum needed to prevent unprofitable
|
||||
// vectorization for the paq8p benchmark. It may need to be
|
||||
// raised further if other unprofitable cases remain.
|
||||
unsigned LHSPenalty = 12;
|
||||
|
||||
// We don't vectorize SREM/UREM so well. Constrain the vectorizer
|
||||
// for those as well.
|
||||
if (Opcode == ISD::SREM || Opcode == ISD::UREM)
|
||||
return Awful;
|
||||
|
||||
// VSELECT is not yet implemented, leading to use of insert/extract
|
||||
// and ISEL, hence not a good idea.
|
||||
if (Opcode == ISD::VSELECT)
|
||||
return Awful;
|
||||
// Vector element insert/extract with Altivec is very expensive,
|
||||
// because they require store and reload with the attendant
|
||||
// processor stall for load-hit-store. Until VSX is available,
|
||||
// these need to be estimated as very costly.
|
||||
if (ISD == ISD::EXTRACT_VECTOR_ELT ||
|
||||
ISD == ISD::INSERT_VECTOR_ELT)
|
||||
return LHSPenalty +
|
||||
TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
|
||||
|
||||
return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,16 @@
|
|||
; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
|
||||
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
define i32 @insert(i32 %arg) {
|
||||
; CHECK: cost of 13 {{.*}} insertelement
|
||||
%x = insertelement <4 x i32> undef, i32 %arg, i32 0
|
||||
ret i32 undef
|
||||
}
|
||||
|
||||
define i32 @extract(<4 x i32> %arg) {
|
||||
; CHECK: cost of 13 {{.*}} extractelement
|
||||
%x = extractelement <4 x i32> %arg, i32 0
|
||||
ret i32 %x
|
||||
}
|
||||
|
Loading…
Reference in New Issue