forked from OSchip/llvm-project
[AArch64] Adjust the cost of integer sum reductions
This changes the cost to (LT.first-1) * cost(add) + 2, where the cost of an add is assumed to be 1. This brings it in line with the other reductions. Differential Revision: https://reviews.llvm.org/D106240
This commit is contained in:
parent
e1bdb57958
commit
c9cebda772
|
@ -1966,18 +1966,19 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
|
||||||
assert(ISD && "Invalid opcode");
|
assert(ISD && "Invalid opcode");
|
||||||
|
|
||||||
// Horizontal adds can use the 'addv' instruction. We model the cost of these
|
// Horizontal adds can use the 'addv' instruction. We model the cost of these
|
||||||
// instructions as normal vector adds. This is the only arithmetic vector
|
// instructions as twice a normal vector add, plus 1 for each legalization
|
||||||
// reduction operation for which we have an instruction.
|
// step (LT.first). This is the only arithmetic vector reduction operation for
|
||||||
|
// which we have an instruction.
|
||||||
// OR, XOR and AND costs should match the codegen from:
|
// OR, XOR and AND costs should match the codegen from:
|
||||||
// OR: llvm/test/CodeGen/AArch64/reduce-or.ll
|
// OR: llvm/test/CodeGen/AArch64/reduce-or.ll
|
||||||
// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
|
// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
|
||||||
// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
|
// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
|
||||||
static const CostTblEntry CostTblNoPairwise[]{
|
static const CostTblEntry CostTblNoPairwise[]{
|
||||||
{ISD::ADD, MVT::v8i8, 1},
|
{ISD::ADD, MVT::v8i8, 2},
|
||||||
{ISD::ADD, MVT::v16i8, 1},
|
{ISD::ADD, MVT::v16i8, 2},
|
||||||
{ISD::ADD, MVT::v4i16, 1},
|
{ISD::ADD, MVT::v4i16, 2},
|
||||||
{ISD::ADD, MVT::v8i16, 1},
|
{ISD::ADD, MVT::v8i16, 2},
|
||||||
{ISD::ADD, MVT::v4i32, 1},
|
{ISD::ADD, MVT::v4i32, 2},
|
||||||
{ISD::OR, MVT::v8i8, 15},
|
{ISD::OR, MVT::v8i8, 15},
|
||||||
{ISD::OR, MVT::v16i8, 17},
|
{ISD::OR, MVT::v16i8, 17},
|
||||||
{ISD::OR, MVT::v4i16, 7},
|
{ISD::OR, MVT::v4i16, 7},
|
||||||
|
@ -2005,7 +2006,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
|
||||||
break;
|
break;
|
||||||
case ISD::ADD:
|
case ISD::ADD:
|
||||||
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
|
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
|
||||||
return LT.first * Entry->Cost;
|
return (LT.first - 1) + Entry->Cost;
|
||||||
break;
|
break;
|
||||||
case ISD::XOR:
|
case ISD::XOR:
|
||||||
case ISD::AND:
|
case ISD::AND:
|
||||||
|
|
|
@ -3,20 +3,20 @@
|
||||||
|
|
||||||
define void @reduce() {
|
define void @reduce() {
|
||||||
; CHECK-LABEL: 'reduce'
|
; CHECK-LABEL: 'reduce'
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
define i8 @add.i8.v8i8(<8 x i8> %v) {
|
define i8 @add.i8.v8i8(<8 x i8> %v) {
|
||||||
; COST-LABEL: 'add.i8.v8i8'
|
; COST-LABEL: 'add.i8.v8i8'
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
|
; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
|
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
|
||||||
;
|
;
|
||||||
%r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
|
%r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
|
||||||
|
@ -12,7 +12,7 @@ define i8 @add.i8.v8i8(<8 x i8> %v) {
|
||||||
|
|
||||||
define i8 @add.i8.v16i8(<16 x i8> %v) {
|
define i8 @add.i8.v16i8(<16 x i8> %v) {
|
||||||
; COST-LABEL: 'add.i8.v16i8'
|
; COST-LABEL: 'add.i8.v16i8'
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
|
; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
|
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r
|
||||||
;
|
;
|
||||||
%r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
|
%r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
|
||||||
|
@ -21,7 +21,7 @@ define i8 @add.i8.v16i8(<16 x i8> %v) {
|
||||||
|
|
||||||
define i16 @add.i16.v4i16(<4 x i16> %v) {
|
define i16 @add.i16.v4i16(<4 x i16> %v) {
|
||||||
; COST-LABEL: 'add.i16.v4i16'
|
; COST-LABEL: 'add.i16.v4i16'
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
|
; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
|
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
|
||||||
;
|
;
|
||||||
%r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
|
%r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
|
||||||
|
@ -30,7 +30,7 @@ define i16 @add.i16.v4i16(<4 x i16> %v) {
|
||||||
|
|
||||||
define i16 @add.i16.v8i16(<8 x i16> %v) {
|
define i16 @add.i16.v8i16(<8 x i16> %v) {
|
||||||
; COST-LABEL: 'add.i16.v8i16'
|
; COST-LABEL: 'add.i16.v8i16'
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
|
; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
|
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
|
||||||
;
|
;
|
||||||
%r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
|
%r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
|
||||||
|
@ -39,7 +39,7 @@ define i16 @add.i16.v8i16(<8 x i16> %v) {
|
||||||
|
|
||||||
define i32 @add.i32.v4i32(<4 x i32> %v) {
|
define i32 @add.i32.v4i32(<4 x i32> %v) {
|
||||||
; COST-LABEL: 'add.i32.v4i32'
|
; COST-LABEL: 'add.i32.v4i32'
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
|
; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
|
||||||
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
|
; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
|
||||||
;
|
;
|
||||||
%r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
|
%r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
|
||||||
|
|
|
@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu"
|
||||||
; REMARK-LABEL: Function: gather_multiple_use
|
; REMARK-LABEL: Function: gather_multiple_use
|
||||||
; REMARK: Args:
|
; REMARK: Args:
|
||||||
; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||||
; REMARK-NEXT: - Cost: '-7'
|
; REMARK-NEXT: - Cost: '-6'
|
||||||
;
|
;
|
||||||
; REMARK-NOT: Function: gather_load
|
; REMARK-NOT: Function: gather_load
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ target triple = "aarch64--linux"
|
||||||
; YAML-NEXT: Function: test_select
|
; YAML-NEXT: Function: test_select
|
||||||
; YAML-NEXT: Args:
|
; YAML-NEXT: Args:
|
||||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||||
; YAML-NEXT: - Cost: '-20'
|
; YAML-NEXT: - Cost: '-19'
|
||||||
; YAML-NEXT: - String: ' and with tree size '
|
; YAML-NEXT: - String: ' and with tree size '
|
||||||
; YAML-NEXT: - TreeSize: '8'
|
; YAML-NEXT: - TreeSize: '8'
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
|
||||||
; YAML-NEXT: Function: reduction_with_br
|
; YAML-NEXT: Function: reduction_with_br
|
||||||
; YAML-NEXT: Args:
|
; YAML-NEXT: Args:
|
||||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||||
; YAML-NEXT: - Cost: '-11'
|
; YAML-NEXT: - Cost: '-10'
|
||||||
; YAML-NEXT: - String: ' and with tree size '
|
; YAML-NEXT: - String: ' and with tree size '
|
||||||
; YAML-NEXT: - TreeSize: '3'
|
; YAML-NEXT: - TreeSize: '3'
|
||||||
; CHECK-LABEL: @reduction_with_br(
|
; CHECK-LABEL: @reduction_with_br(
|
||||||
|
@ -244,7 +244,7 @@ for.end: ; preds = %for.end.loopexit, %
|
||||||
; YAML-NEXT: Function: test_unrolled_select
|
; YAML-NEXT: Function: test_unrolled_select
|
||||||
; YAML-NEXT: Args:
|
; YAML-NEXT: Args:
|
||||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||||
; YAML-NEXT: - Cost: '-37'
|
; YAML-NEXT: - Cost: '-36'
|
||||||
; YAML-NEXT: - String: ' and with tree size '
|
; YAML-NEXT: - String: ' and with tree size '
|
||||||
; YAML-NEXT: - TreeSize: '10'
|
; YAML-NEXT: - TreeSize: '10'
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue