forked from OSchip/llvm-project
[AArch64][CostModel] Detect that {extract,insert}-element at lane 0 has the same cost as at other lanes for vector instructions in the IR.
Currently, {extract,insert}-element has zero cost at lane 0 [1]. However, there is a cost (an fmov instruction [2], or an ext/ins instruction) to move values from SIMD registers to GPR registers when the element is used explicitly as an integer. See https://godbolt.org/z/faPE1nTn8, where fmov is generated for the d* register -> x* register conversion.
Implementation-wise, add a private method `AArch64TTIImpl::getVectorInstrCostHelper` as a helper function. This way, the instruction-based method can share the core logic (e.g., returning zero cost if the type is legalized to scalar).
[1] 2cf320d41e/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (L1853)
[2] 2cf320d41e/llvm/lib/Target/AArch64/AArch64InstrInfo.td (L8150-L8157)
Differential Revision: https://reviews.llvm.org/D128302
This commit is contained in:
parent
4d50a39240
commit
8aa800614b
|
@ -1968,8 +1968,9 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
|
|||
return 0;
|
||||
}
|
||||
|
||||
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) {
|
||||
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
|
||||
unsigned Index,
|
||||
bool HasRealUse) {
|
||||
assert(Val->isVectorTy() && "This must be a vector type");
|
||||
|
||||
if (Index != -1U) {
|
||||
|
@ -1988,7 +1989,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|||
}
|
||||
|
||||
// The element at index zero is already inside the vector.
|
||||
if (Index == 0)
|
||||
// - For a physical (HasRealUse==true) insert-element or extract-element
|
||||
// instruction that extracts integers, an explicit FPR -> GPR move is
|
||||
// needed. So it has non-zero cost.
|
||||
// - For the rest of cases (virtual instruction or element type is float),
|
||||
// consider the instruction free.
|
||||
//
|
||||
// FIXME:
|
||||
// If the extract-element and insert-element instructions could be
|
||||
// simplified away (e.g., could be combined into users by looking at use-def
|
||||
// context), they have no cost. This is not done in the first place for
|
||||
// compile-time considerations.
|
||||
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1996,6 +2008,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|||
return ST->getVectorInsertExtractBaseCost();
|
||||
}
|
||||
|
||||
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index) {
|
||||
return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
|
||||
}
|
||||
|
||||
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
|
||||
Type *Val, unsigned Index) {
|
||||
return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
|
||||
}
|
||||
|
||||
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
|
||||
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
|
||||
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
|
||||
|
|
|
@ -59,6 +59,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
|
|||
bool isWideningInstruction(Type *Ty, unsigned Opcode,
|
||||
ArrayRef<const Value *> Args);
|
||||
|
||||
// A helper function called by 'getVectorInstrCost'.
|
||||
//
|
||||
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
|
||||
// indicates whether the vector instruction is available in the input IR or
|
||||
// just imaginary in vectorizer passes.
|
||||
InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index,
|
||||
bool HasRealUse);
|
||||
|
||||
public:
|
||||
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
|
||||
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
|
||||
|
@ -173,9 +181,10 @@ public:
|
|||
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
|
||||
const Instruction *I = nullptr);
|
||||
|
||||
using BaseT::getVectorInstrCost;
|
||||
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
|
||||
unsigned Index);
|
||||
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
|
||||
unsigned Index);
|
||||
|
||||
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
|
||||
bool IsUnsigned,
|
||||
|
|
|
@ -6,18 +6,16 @@ target triple = "aarch64--linux-gnu"
|
|||
; CHECK-LABEL: vectorInstrCost
|
||||
define void @vectorInstrCost() {
|
||||
|
||||
; Vector extracts - extracting the first element should have a zero cost;
|
||||
; all other elements should have a cost of two.
|
||||
; Vector extracts - extracting elements should have a cost of two.
|
||||
;
|
||||
; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
|
||||
%t1 = extractelement <2 x i64> undef, i32 0
|
||||
%t2 = extractelement <2 x i64> undef, i32 1
|
||||
|
||||
; Vector inserts - inserting the first element should have a zero cost; all
|
||||
; other elements should have a cost of two.
|
||||
; Vector inserts - inserting elements should have a cost of two.
|
||||
;
|
||||
; CHECK: cost of 0 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 1
|
||||
%t3 = insertelement <2 x i64> poison, i64 undef, i32 0
|
||||
%t4 = insertelement <2 x i64> poison, i64 undef, i32 1
|
||||
|
|
|
@ -6,45 +6,18 @@ target triple = "aarch64--linux-gnu"
|
|||
; CHECK-LABEL: vectorInstrCost
|
||||
define void @vectorInstrCost() {
|
||||
|
||||
; Vector extracts - extracting the first element should have a zero cost;
|
||||
; all other elements should have a cost of two.
|
||||
; Vector extracts - extracting elements should have a cost of two.
|
||||
;
|
||||
; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
|
||||
%t1 = extractelement <2 x i64> undef, i32 0
|
||||
%t2 = extractelement <2 x i64> undef, i32 1
|
||||
|
||||
; Vector inserts - inserting the first element should have a zero cost; all
|
||||
; other elements should have a cost of two.
|
||||
; Vector inserts - inserting elements should have a cost of two.
|
||||
;
|
||||
; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
|
||||
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
|
||||
%t3 = insertelement <2 x i64> undef, i64 undef, i32 0
|
||||
%t4 = insertelement <2 x i64> undef, i64 undef, i32 1
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: vectorInstrExtractCost
|
||||
define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
|
||||
|
||||
; Vector extracts - extracting each element at index 0 is considered
|
||||
; free in the current implementation. When extracting element at index
|
||||
; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as
|
||||
; well.
|
||||
;
|
||||
; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
|
||||
; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
|
||||
%t1 = extractelement <4 x i64> %vecreg, i32 1
|
||||
%t2 = extractelement <4 x i64> %vecreg, i32 2
|
||||
%ele = add i64 %t2, 1
|
||||
%cond = icmp eq i64 %t1, %ele
|
||||
|
||||
; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
|
||||
; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
|
||||
%t0 = extractelement <4 x i64> %vecreg, i32 0
|
||||
%t3 = extractelement <4 x i64> %vecreg, i32 3
|
||||
%val = select i1 %cond, i64 %t0 , i64 %t3
|
||||
|
||||
ret i64 %val
|
||||
}
|
||||
|
|
|
@ -9,10 +9,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
|||
|
||||
define void @ins_el0() #0 {
|
||||
; CHECK-DEFAULT-LABEL: 'ins_el0'
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
|
@ -27,10 +27,10 @@ define void @ins_el0() #0 {
|
|||
; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; CHECK-HIGH-LABEL: 'ins_el0'
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
|
@ -84,10 +84,10 @@ define void @ins_el1() #0 {
|
|||
|
||||
define void @ext_el0() #0 {
|
||||
; CHECK-DEFAULT-LABEL: 'ext_el0'
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
|
||||
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
|
@ -102,10 +102,10 @@ define void @ext_el0() #0 {
|
|||
; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; CHECK-HIGH-LABEL: 'ext_el0'
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
|
||||
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
|
|
|
@ -18,24 +18,23 @@ define i1 @func(ptr %0, i64 %1) {
|
|||
; CHECK-NEXT: [[TMP12]] = add i64 [[TMP4]], 1
|
||||
; CHECK-NEXT: br label [[TMP3]]
|
||||
; CHECK: .split.loop.exit:
|
||||
; CHECK-NEXT: [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
|
||||
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
|
||||
; CHECK-NEXT: [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
|
||||
; CHECK-NEXT: [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], -1
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
|
||||
; CHECK-NEXT: br label [[TMP17:%.*]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
|
||||
; CHECK-NEXT: br label [[TMP16:%.*]]
|
||||
; CHECK: .split.loop.exit2:
|
||||
; CHECK-NEXT: [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
|
||||
; CHECK-NEXT: [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
|
||||
; CHECK-NEXT: br label [[TMP17]]
|
||||
; CHECK: 17:
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP18]], true
|
||||
; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
|
||||
; CHECK-NEXT: ret i1 [[TMP21]]
|
||||
; CHECK-NEXT: br label [[TMP16]]
|
||||
; CHECK: 16:
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP17]], true
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
|
||||
; CHECK-NEXT: ret i1 [[TMP20]]
|
||||
;
|
||||
br label %3
|
||||
|
||||
|
|
Loading…
Reference in New Issue