[AArch64][CostModel] Detect that {extract,insert}-element at lane 0 has the same cost as at the other lanes for vector instructions in the IR.

Currently, {extract,insert}-element has zero cost at lane 0 [1]. However, there is a cost (an fmov instruction [2], or an ext/ins instruction) to move values from SIMD registers to GPR registers when the element is used explicitly as an integer. See https://godbolt.org/z/faPE1nTn8, where fmov is generated for the d* register -> x* register conversion. Implementation-wise, add a private method `AArch64TTIImpl::getVectorInstrCostHelper` as a helper function. This way, the instruction-based method can share the core logic (e.g., returning zero cost if the type is legalized to a scalar). [1] 2cf320d41e/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (L1853) [2] 2cf320d41e/llvm/lib/Target/AArch64/AArch64InstrInfo.td (L8150-L8157) Differential Revision: https://reviews.llvm.org/D128302
2022-06-21 13:38:30 -07:00 · 2022-06-21 13:38:30 -07:00 · 8aa800614b
parent 4d50a39240
commit 8aa800614b
6 changed files with 71 additions and 70 deletions
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@ -1968,8 +1968,9 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
  return 0;
 }

-InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                                   unsigned Index) {
+InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
+                                                         unsigned Index,
+                                                         bool HasRealUse) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
@ -1988,7 +1989,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
    }

    // The element at index zero is already inside the vector.
-    if (Index == 0)
+    // - For a physical (HasRealUse==true) insert-element or extract-element
+    // instruction that extracts integers, an explicit FPR -> GPR move is
+    // needed. So it has non-zero cost.
+    // - For the rest of cases (virtual instruction or element type is float),
+    // consider the instruction free.
+    //
+    // FIXME:
+    // If the extract-element and insert-element instructions could be
+    // simplified away (e.g., could be combined into users by looking at use-def
+    // context), they have no cost. This is not done in the first place for
+    // compile-time considerations.
+    if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
      return 0;
  }

@ -1996,6 +2008,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
  return ST->getVectorInsertExtractBaseCost();
 }

+InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   unsigned Index) {
+  return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
+}
+
+InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
+                                                   Type *Val, unsigned Index) {
+  return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
+}
+
 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@ -59,6 +59,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
  bool isWideningInstruction(Type *Ty, unsigned Opcode,
                             ArrayRef<const Value *> Args);

+  // A helper function called by 'getVectorInstrCost'.
+  //
+  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
+  // indicates whether the vector instruction is available in the input IR or
+  // just imaginary in vectorizer passes.
+  InstructionCost getVectorInstrCostHelper(Type *Val, unsigned Index,
+                                           bool HasRealUse);
+
 public:
  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@ -173,9 +181,10 @@ public:
  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

-  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     unsigned Index);
+  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     unsigned Index);

  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                         bool IsUnsigned,
--- a/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo-inseltpoison.ll
@ -6,18 +6,16 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-LABEL: vectorInstrCost
 define void @vectorInstrCost() {

-    ; Vector extracts - extracting the first element should have a zero cost;
-    ; all other elements should have a cost of two.
+    ; Vector extracts - extracting elements should have a cost of two.
    ;
-    ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
+    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
    %t1 = extractelement <2 x i64> undef, i32 0
    %t2 = extractelement <2 x i64> undef, i32 1

-    ; Vector inserts - inserting the first element should have a zero cost; all
-    ; other elements should have a cost of two.
+    ; Vector inserts - inserting elements should have a cost of two.
    ;
-    ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
+    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 0
    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> poison, i64 undef, i32 1
    %t3 = insertelement <2 x i64> poison, i64 undef, i32 0
    %t4 = insertelement <2 x i64> poison, i64 undef, i32 1
--- a/llvm/test/Analysis/CostModel/AArch64/kryo.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/kryo.ll
@ -6,45 +6,18 @@ target triple = "aarch64--linux-gnu"
 ; CHECK-LABEL: vectorInstrCost
 define void @vectorInstrCost() {

-    ; Vector extracts - extracting the first element should have a zero cost;
-    ; all other elements should have a cost of two.
+    ; Vector extracts - extracting elements should have a cost of two.
    ;
-    ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
+    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 0
    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
    %t1 = extractelement <2 x i64> undef, i32 0
    %t2 = extractelement <2 x i64> undef, i32 1

-    ; Vector inserts - inserting the first element should have a zero cost; all
-    ; other elements should have a cost of two.
+    ; Vector inserts - inserting elements should have a cost of two.
    ;
-    ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
+    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
    %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
    %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
-
    ret void
 }
-
-; CHECK-LABEL: vectorInstrExtractCost
-define i64 @vectorInstrExtractCost(<4 x i64> %vecreg) {
-    
-    ; Vector extracts - extracting each element at index 0 is considered
-    ; free in the current implementation. When extracting element at index
-    ; 2, 2 is rounded to 0, so extracting element at index 2 has cost 0 as 
-    ; well.
-    ;
-    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 1
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 2
-    %t1 = extractelement <4 x i64> %vecreg, i32 1
-    %t2 = extractelement <4 x i64> %vecreg, i32 2
-    %ele = add i64 %t2, 1
-    %cond = icmp eq i64 %t1, %ele
-
-    ; CHECK: cost of 0 {{.*}} extractelement <4 x i64> %vecreg, i32 0
-    ; CHECK: cost of 2 {{.*}} extractelement <4 x i64> %vecreg, i32 3
-    %t0 = extractelement <4 x i64> %vecreg, i32 0
-    %t3 = extractelement <4 x i64> %vecreg, i32 3
-    %val = select i1 %cond, i64 %t0 , i64 %t3
-
-    ret i64 %val
-}
--- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll
@ -9,10 +9,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"

 define void @ins_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ins_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -27,10 +27,10 @@ define void @ins_el0() #0 {
 ; CHECK-LOW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-HIGH-LABEL: 'ins_el0'
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement <vscale x 16 x i8> zeroinitializer, i8 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement <vscale x 8 x i16> zeroinitializer, i16 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement <vscale x 4 x i32> zeroinitializer, i32 0, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement <vscale x 2 x i64> zeroinitializer, i64 0, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement <vscale x 4 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement <vscale x 2 x double> zeroinitializer, double 0.000000e+00, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -84,10 +84,10 @@ define void @ins_el1() #0 {

 define void @ext_el0() #0 {
 ; CHECK-DEFAULT-LABEL: 'ext_el0'
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-DEFAULT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
@ -102,10 +102,10 @@ define void @ext_el0() #0 {
 ; CHECK-LOW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-HIGH-LABEL: 'ext_el0'
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
-; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement <vscale x 16 x i8> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement <vscale x 8 x i16> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement <vscale x 4 x i32> zeroinitializer, i64 0
+; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement <vscale x 2 x i64> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement <vscale x 4 x float> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement <vscale x 2 x double> zeroinitializer, i64 0
 ; CHECK-HIGH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
--- a/llvm/test/Transforms/LICM/AArch64/extract-element.ll
+++ b/llvm/test/Transforms/LICM/AArch64/extract-element.ll
@ -18,24 +18,23 @@ define i1 @func(ptr %0, i64 %1) {
 ; CHECK-NEXT:    [[TMP12]] = add i64 [[TMP4]], 1
 ; CHECK-NEXT:    br label [[TMP3]]
 ; CHECK:       .split.loop.exit:
-; CHECK-NEXT:    [[DOTLCSSA7:%.*]] = phi <1 x i64> [ [[TMP8]], [[TMP6]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i64 [ [[TMP9]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTLCSSA6:%.*]] = phi i64 [ [[TMP4]], [[TMP6]] ]
 ; CHECK-NEXT:    [[DOTPH:%.*]] = phi i1 [ [[TMP5]], [[TMP6]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <1 x i64> [[DOTLCSSA7]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[DOTLCSSA6]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp uge i64 [[TMP15]], [[TMP1]]
-; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[DOTLCSSA]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP13]], [[DOTLCSSA6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp uge i64 [[TMP14]], [[TMP1]]
+; CHECK-NEXT:    br label [[TMP16:%.*]]
 ; CHECK:       .split.loop.exit2:
 ; CHECK-NEXT:    [[DOTPH3:%.*]] = phi i1 [ [[TMP5]], [[TMP3]] ]
 ; CHECK-NEXT:    [[DOTPH4:%.*]] = phi i1 [ undef, [[TMP3]] ]
-; CHECK-NEXT:    br label [[TMP17]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i1 [ [[TMP16]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = xor i1 [[TMP18]], true
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP19]]
-; CHECK-NEXT:    ret i1 [[TMP21]]
+; CHECK-NEXT:    br label [[TMP16]]
+; CHECK:       16:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[DOTPH]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH3]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[TMP15]], [[DOTSPLIT_LOOP_EXIT]] ], [ [[DOTPH4]], [[DOTSPLIT_LOOP_EXIT2]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP18]]
+; CHECK-NEXT:    ret i1 [[TMP20]]
 ;
  br label %3