[AArch64][CostModel] Detect that {extract,insert}-element at lane 0 has the same cost as at other lanes for vector instructions present in the IR.

Currently, {extract,insert}-element has zero cost at lane 0 [1]. However, there is a cost (an fmov instruction [2], or an ext/ins instruction) to move values from SIMD registers to GPRs when the element is used explicitly as an integer.

See https://godbolt.org/z/faPE1nTn8, where an fmov is generated for the d* register -> x* register move.

Implementation-wise, add a private method `AArch64TTIImpl::getVectorInstrCostHelper` as a helper function. This way, the instruction-based overload of `getVectorInstrCost` can share the core logic
(e.g., returning zero cost if the type is legalized to a scalar) with the opcode-based overload.

[1] 2cf320d41e/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (L1853)
[2] 2cf320d41e/llvm/lib/Target/AArch64/AArch64InstrInfo.td (L8150-L8157)

Differential Revision: https://reviews.llvm.org/D128302
This commit is contained in:
Mingming Liu 2022-06-21 13:38:30 -07:00
parent 4d50a39240
commit 8aa800614b
6 changed files with 71 additions and 70 deletions

View File

@ -1968,8 +1968,9 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
return 0;
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) {
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
unsigned Index,
bool HasRealUse) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
@ -1988,7 +1989,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// The element at index zero is already inside the vector.
if (Index == 0)
// - For a physical (HasRealUse==true) insert-element or extract-element
// instruction that extracts integers, an explicit FPR -> GPR move is
// needed. So it has non-zero cost.
// - For the rest of cases (virtual instruction or element type is float),
// consider the instruction free.
//
// FIXME:
// If the extract-element and insert-element instructions could be
// simplified away (e.g., could be combined into users by looking at use-def
// context), they have no cost. This is not done in the first place for
// compile-time considerations.
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
return 0;
}
@ -1996,6 +2008,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
return ST->getVectorInsertExtractBaseCost();
}
// Opcode-based cost query. No concrete IR instruction is supplied here, so the
// shared helper is asked for the cost with HasRealUse == false. 'Opcode' is
// not consulted by this overload.
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) {
  const bool HasRealUse = false;
  return getVectorInstrCostHelper(Val, Index, HasRealUse);
}
// Instruction-based cost query. The caller hands in an actual IR instruction,
// so the shared helper is asked for the cost with HasRealUse == true. 'I'
// itself is not inspected by this overload.
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                   Type *Val, unsigned Index) {
  const bool HasRealUse = true;
  return getVectorInstrCostHelper(Val, Index, HasRealUse);
}
InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

View File

@ -59,6 +59,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
bool isWideningInstruction(Type *Ty, unsigned Opcode,
ArrayRef<const Value *> Args);
// A helper function called by the 'getVectorInstrCost' overloads; it holds the
// cost logic they share.
//
// 'Val' and 'Index' are forwarded unchanged from 'getVectorInstrCost'.
// 'HasRealUse' indicates whether the vector instruction is actually present
// in the input IR (true) or is just imaginary, e.g. a hypothetical
// instruction queried by vectorizer passes (false).
InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index,
bool HasRealUse);
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@ -173,9 +181,10 @@ public:
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
// Keep the base-class 'getVectorInstrCost' overloads visible; the declarations
// below would otherwise hide them.
using BaseT::getVectorInstrCost;
// Cost query for a vector instruction on type 'Val' at lane 'Index' when only
// the opcode is known.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index);
// Cost query for a vector instruction 'I' on type 'Val' at lane 'Index'.
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
unsigned Index);
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsUnsigned,

View File

@ -6,18 +6,16 @@ target triple = "aarch64--linux-gnu"
; CHECK-LABEL: vectorInstrCost
define void @vectorInstrCost() {
; Vector extracts - extracting the first element should have a zero cost;
; all other elements should have a cost of two.
; Vector extracts - extracting elements should have a cost of two.
;
; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
%t1 = extractelement <2 x i64> undef, i32 0
%t2 = extractelement <2 x i64> undef, i32 1
; Vector inserts - inserting the first element should have a zero cost; all
; other elements should have a cost of two.
; Vector inserts - inserting elements should have a cost of two.
;
; CHECK: cost of 0 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 1
%t3 = insertelement <2 x i64> poison, i64 undef, i32 0
%t4 = insertelement <2 x i64> poison, i64 undef, i32 1

View File

@ -6,45 +6,18 @@ target triple = "aarch64--linux-gnu"
; CHECK-LABEL: vectorInstrCost
define void @vectorInstrCost() {
; Vector extracts - extracting the first element should have a zero cost;
; all other elements should have a cost of two.
; Vector extracts - extracting elements should have a cost of two.
;
; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
%t1 = extractelement <2 x i64> undef, i32 0
%t2 = extractelement <2 x i64> undef, i32 1
; Vector inserts - inserting the first element should have a zero cost; all
; other elements should have a cost of two.
; Vector inserts - inserting elements should have a cost of two.
;
; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
%t3 = insertelement <2 x i64> undef, i64 undef, i32 0
%t4 = insertelement <2 x i64> undef, i64 undef, i32 1
ret void
}
; CHECK-LABEL: vectorInstrExtractCost
define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
; Vector extracts - extracting each element at index 0 is considered
; free in the current implementation. When extracting element at index
; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as
; well.
;
; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
%t1 = extractelement <4 x i64> %vecreg, i32 1
%t2 = extractelement <4 x i64> %vecreg, i32 2
%ele = add i64 %t2, 1
%cond = icmp eq i64 %t1, %ele
; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
%t0 = extractelement <4 x i64> %vecreg, i32 0
%t3 = extractelement <4 x i64> %vecreg, i32 3
%val = select i1 %cond, i64 %t0 , i64 %t3
ret i64 %val
}

View File

@ -9,10 +9,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @ins_el0() #0 {
; CHECK-DEFAULT-LABEL: 'ins_el0'
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -27,10 +27,10 @@ define void @ins_el0() #0 {
; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-HIGH-LABEL: 'ins_el0'
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -84,10 +84,10 @@ define void @ins_el1() #0 {
define void @ext_el0() #0 {
; CHECK-DEFAULT-LABEL: 'ext_el0'
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -102,10 +102,10 @@ define void @ext_el0() #0 {
; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-HIGH-LABEL: 'ext_el0'
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void

View File

@ -18,24 +18,23 @@ define i1 @func(ptr %0, i64 %1) {
; CHECK-NEXT: [[TMP12]] = add i64 [[TMP4]], 1
; CHECK-NEXT: br label [[TMP3]]
; CHECK: .split.loop.exit:
; CHECK-NEXT: [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
; CHECK-NEXT: [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
; CHECK-NEXT: [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], -1
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
; CHECK-NEXT: br label [[TMP17:%.*]]
; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
; CHECK-NEXT: br label [[TMP16:%.*]]
; CHECK: .split.loop.exit2:
; CHECK-NEXT: [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
; CHECK-NEXT: [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
; CHECK-NEXT: br label [[TMP17]]
; CHECK: 17:
; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
; CHECK-NEXT: [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
; CHECK-NEXT: [[TMP20:%.*]] = xor i1 [[TMP18]], true
; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
; CHECK-NEXT: ret i1 [[TMP21]]
; CHECK-NEXT: br label [[TMP16]]
; CHECK: 16:
; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
; CHECK-NEXT: [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
; CHECK-NEXT: [[TMP19:%.*]] = xor i1 [[TMP17]], true
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
; CHECK-NEXT: ret i1 [[TMP20]]
;
br label %3