[AArch64] Use first op of FADDPv* instead of implicit def.

This patch updates the FADDPv* patterns that only use the lower half of the result register. For those patterns, the second operand does not matter because its results won't be used. Instead of introducing new implicit defs for those operands, just use the first operand. The problem with using new implicit defs is that register allocation can introduce unnecessary dependencies by using a different register than the first operand. For motivating cases, see the changes in the fadd_reduction_*_in_loop cases. Without this change, the first faddp in the loop has an unnecessary additional dependency through v0, which is also used for a cross-iteration reduction. This can noticeable impact performance. For slightly bigger loops, this change can improve performance by 15%. Reviewed By: sdesmalen, t.p.northover Differential Revision: https://reviews.llvm.org/D120706
2022-03-03 13:32:08 +00:00 · 2022-03-03 13:32:08 +00:00 · 0f261256e0
parent 28ccf32672
commit 0f261256e0
2 changed files with 14 additions and 11 deletions
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@ -5345,19 +5345,22 @@ defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
 defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
 defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;

+// Only the lower half of the result of the inner FADDP is used in the patterns
+// below, so the second operand does not matter. Re-use the first input
+// operand, so no additional dependencies need to be introduced.
 let Predicates = [HasFullFP16] in {
 def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
            (FADDPv2i16p
              (EXTRACT_SUBREG
-                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+                 (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), V128:$Rn),
               dsub))>;
 def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
-          (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+          (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>;
 }
 def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
          (FADDPv2i32p
            (EXTRACT_SUBREG
-              (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+              (FADDPv4f32 V128:$Rn, V128:$Rn),
             dsub))>;
 def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
          (FADDPv2i32p V64:$Rn)>;
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@ -45,8 +45,8 @@ define half @add_HalfH(<4 x half> %bin.rdx)  {
 define half @add_H(<8 x half> %bin.rdx)  {
 ; FULLFP16-LABEL: add_H:
 ; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@ -115,8 +115,8 @@ define half @add_2H(<16 x half> %bin.rdx)  {
 ; FULLFP16-LABEL: add_2H:
 ; FULLFP16:       // %bb.0:
 ; FULLFP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
 ; FULLFP16-NEXT:    faddp h0, v0.2h
 ; FULLFP16-NEXT:    ret
 ;
@ -248,7 +248,7 @@ define float @fadd_reduction_v4f32_in_loop(float* %ptr.start) {
 ; CHECK-NEXT:    ldr q1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp w8, #112
-; CHECK-NEXT:    faddp v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    faddp s1, v1.2s
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    b.ne .LBB9_1
@ -286,7 +286,7 @@ define half @fadd_reduction_v4f16_in_loop(half* %ptr.start) {
 ; FULLFP16-NEXT:    ldr d1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v0.4h
+; FULLFP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB10_1
@ -357,8 +357,8 @@ define half @fadd_reduction_v8f16_in_loop(half* %ptr.start) {
 ; FULLFP16-NEXT:    ldr q1, [x0, x8]
 ; FULLFP16-NEXT:    add x8, x8, #8
 ; FULLFP16-NEXT:    cmp w8, #56
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
-; FULLFP16-NEXT:    faddp v1.8h, v1.8h, v0.8h
+; FULLFP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
+; FULLFP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
 ; FULLFP16-NEXT:    faddp h1, v1.2h
 ; FULLFP16-NEXT:    fadd h0, h1, h0
 ; FULLFP16-NEXT:    b.ne .LBB11_1