From c9cebda772cbb6cdfc096c10cbc3cc3826801109 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 22 Jul 2021 18:19:54 +0100 Subject: [PATCH] [AArch64] Adjust the cost of integer sum reductions This changes the cost to (LT.first-1) * cost(add) + 2, where the cost of an add is assumed to be 1. This brings it in line with the other reductions. Differential Revision: https://reviews.llvm.org/D106240 --- .../AArch64/AArch64TargetTransformInfo.cpp | 17 ++++++------- .../Analysis/CostModel/AArch64/reduce-add.ll | 24 +++++++++---------- .../CostModel/AArch64/vector-reduce.ll | 10 ++++---- .../SLPVectorizer/AArch64/gather-cost.ll | 2 +- .../SLPVectorizer/AArch64/horizontal.ll | 6 ++--- 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4ab754465f39..1d2d49b287d4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1966,18 +1966,19 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, assert(ISD && "Invalid opcode"); // Horizontal adds can use the 'addv' instruction. We model the cost of these - // instructions as normal vector adds. This is the only arithmetic vector - // reduction operation for which we have an instruction. + // instructions as twice a normal vector add, plus 1 for each legalization + // step (LT.first). This is the only arithmetic vector reduction operation for + // which we have an instruction. 
// OR, XOR and AND costs should match the codegen from: // OR: llvm/test/CodeGen/AArch64/reduce-or.ll // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll // AND: llvm/test/CodeGen/AArch64/reduce-and.ll static const CostTblEntry CostTblNoPairwise[]{ - {ISD::ADD, MVT::v8i8, 1}, - {ISD::ADD, MVT::v16i8, 1}, - {ISD::ADD, MVT::v4i16, 1}, - {ISD::ADD, MVT::v8i16, 1}, - {ISD::ADD, MVT::v4i32, 1}, + {ISD::ADD, MVT::v8i8, 2}, + {ISD::ADD, MVT::v16i8, 2}, + {ISD::ADD, MVT::v4i16, 2}, + {ISD::ADD, MVT::v8i16, 2}, + {ISD::ADD, MVT::v4i32, 2}, {ISD::OR, MVT::v8i8, 15}, {ISD::OR, MVT::v16i8, 17}, {ISD::OR, MVT::v4i16, 7}, @@ -2005,7 +2006,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, break; case ISD::ADD: if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) - return LT.first * Entry->Cost; + return (LT.first - 1) + Entry->Cost; break; case ISD::XOR: case ISD::AND: diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll index 8bbb96f20dc9..e1f2af680a63 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll @@ -3,20 +3,20 @@ define void @reduce() { ; CHECK-LABEL: 'reduce' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1i8 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3i8 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4i64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll index df25a6fedf2b..4aef33c632dd 100644 --- a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll @@ -3,7 +3,7 @@ define i8 @add.i8.v8i8(<8 x i8> %v) { ; COST-LABEL: 'add.i8.v8i8' -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) +; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) ; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r ; %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) @@ -12,7 +12,7 @@ define i8 @add.i8.v8i8(<8 x i8> %v) { define i8 
@add.i8.v16i8(<16 x i8> %v) { ; COST-LABEL: 'add.i8.v16i8' -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) +; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) ; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %r ; %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) @@ -21,7 +21,7 @@ define i8 @add.i8.v16i8(<16 x i8> %v) { define i16 @add.i16.v4i16(<4 x i16> %v) { ; COST-LABEL: 'add.i16.v4i16' -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) +; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) ; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) @@ -30,7 +30,7 @@ define i16 @add.i16.v4i16(<4 x i16> %v) { define i16 @add.i16.v8i16(<8 x i16> %v) { ; COST-LABEL: 'add.i16.v8i16' -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) +; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) ; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) @@ -39,7 +39,7 @@ define i16 @add.i16.v8i16(<8 x i16> %v) { define i32 @add.i32.v4i32(<4 x i32> %v) { ; COST-LABEL: 'add.i32.v4i32' -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) +; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) ; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; 
%r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll index 57db62ace206..22370f55e128 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -10,7 +10,7 @@ target triple = "aarch64--linux-gnu" ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; REMARK-NEXT: - Cost: '-7' +; REMARK-NEXT: - Cost: '-6' ; ; REMARK-NOT: Function: gather_load diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 2666a9f3bd6d..a9efaf3b07d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -15,7 +15,7 @@ target triple = "aarch64--linux" ; YAML-NEXT: Function: test_select ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-20' +; YAML-NEXT: - Cost: '-19' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '8' @@ -143,7 +143,7 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia ; YAML-NEXT: Function: reduction_with_br ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-11' +; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' ; CHECK-LABEL: @reduction_with_br( @@ -244,7 +244,7 @@ for.end: ; preds = %for.end.loopexit, % ; YAML-NEXT: Function: test_unrolled_select ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-37' +; YAML-NEXT: - Cost: '-36' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '10'