From 6c289a1c744802c317986989915cc328d25c210d Mon Sep 17 00:00:00 2001
From: Matthew Simpson
Date: Fri, 23 Mar 2018 14:18:27 +0000
Subject: [PATCH] [SLP] Stop counting cost of gather sequences with multiple uses

When building the SLP tree, we look for reuse among the vectorized tree
entries. However, each gather sequence is represented by a unique tree
entry, even though the sequence may be identical to another one. This
means, for example, that a gather sequence with two uses will be counted
twice when computing the cost of the tree. We should only count the cost
of the definition of a gather sequence rather than its uses. During code
generation, the redundant gather sequences are emitted, but we optimize
them away with CSE. So it looks like this problem just affects the cost
model.

Differential Revision: https://reviews.llvm.org/D44742

llvm-svn: 328316
---
 .../Transforms/Vectorize/SLPVectorizer.cpp | 23 ++++++++++-
 .../SLPVectorizer/AArch64/gather-cost.ll   | 40 ++++++-------------
 .../SLPVectorizer/X86/blending-shuffle.ll  | 15 +++----
 3 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b21a9e0250ae..af74acb41504 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2457,7 +2457,28 @@ int BoUpSLP::getTreeCost() {
 
   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
 
-  for (TreeEntry &TE : VectorizableTree) {
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry &TE = VectorizableTree[I];
+
+    // We create duplicate tree entries for gather sequences that have multiple
+    // uses. However, we should not compute the cost of duplicate sequences.
+    // For example, if we have a build vector (i.e., insertelement sequence)
+    // that is used by more than one vector instruction, we only need to
+    // compute the cost of the insertelement instructions once. The redundant
+    // instructions will be eliminated by CSE.
+    //
+    // We should consider not creating duplicate tree entries for gather
+    // sequences, and instead add additional edges to the tree representing
+    // their uses. Since such an approach results in fewer total entries,
+    // existing heuristics based on tree size may yield different results.
+    //
+    if (TE.NeedToGather &&
+        std::any_of(std::next(VectorizableTree.begin(), I + 1),
+                    VectorizableTree.end(), [TE](TreeEntry &Entry) {
+                      return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+                    }))
+      continue;
+
     int C = getEntryCost(&TE);
     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
                  << *TE.Scalars[0] << ".\n");
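The skip above can be exercised in isolation. The following is a minimal,
self-contained sketch, not LLVM code: Entry is a simplified stand-in for
TreeEntry, and each costed entry is charged a made-up unit cost. It shows
that of two identical gather entries only the last occurrence is costed:

    // Sketch only: simplified stand-in types, invented unit costs.
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <string>
    #include <vector>

    struct Entry {
      bool NeedToGather;
      std::vector<std::string> Scalars; // stand-in for the bundled scalars
      bool isSame(const std::vector<std::string> &VL) const {
        return Scalars == VL;
      }
    };

    int main() {
      // Entries 1 and 2 model the same gather sequence reached through two
      // different uses; its cost should be attributed only once.
      std::vector<Entry> Tree = {{false, {"a", "b", "c", "d"}},
                                 {true, {"w", "x", "y", "z"}},
                                 {true, {"w", "x", "y", "z"}}};
      int Cost = 0;
      for (unsigned I = 0, E = Tree.size(); I < E; ++I) {
        Entry &TE = Tree[I];
        if (TE.NeedToGather &&
            std::any_of(std::next(Tree.begin(), I + 1), Tree.end(),
                        [&TE](const Entry &Other) {
                          return Other.NeedToGather && Other.isSame(TE.Scalars);
                        }))
          continue; // an identical gather appears later; cost it there instead
        Cost += 1;  // pretend every costed entry contributes a unit cost
      }
      std::cout << "entries costed: " << Cost << "\n"; // prints 2, not 3
      return 0;
    }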
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index aec33c57160a..a4c655ca3cb0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -7,36 +7,22 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT: [[TMP00:%.*]] = lshr i32 [[A:%.*]], 15
-; CHECK-NEXT: [[TMP01:%.*]] = and i32 [[TMP00]], 65537
-; CHECK-NEXT: [[TMP02:%.*]] = mul nuw i32 [[TMP01]], 65535
-; CHECK-NEXT: [[TMP03:%.*]] = add i32 [[TMP02]], [[A]]
-; CHECK-NEXT: [[TMP04:%.*]] = xor i32 [[TMP03]], [[TMP02]]
-; CHECK-NEXT: [[TMP05:%.*]] = lshr i32 [[C:%.*]], 15
-; CHECK-NEXT: [[TMP06:%.*]] = and i32 [[TMP05]], 65537
-; CHECK-NEXT: [[TMP07:%.*]] = mul nuw i32 [[TMP06]], 65535
-; CHECK-NEXT: [[TMP08:%.*]] = add i32 [[TMP07]], [[C]]
-; CHECK-NEXT: [[TMP09:%.*]] = xor i32 [[TMP08]], [[TMP07]]
-; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[B:%.*]], 15
-; CHECK-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 65537
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 65535
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[B]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[D:%.*]], 15
-; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65537
-; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 65535
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], [[D]]
-; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP09]], [[TMP04]]
-; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], [[TMP14]]
-; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP19]]
-; CHECK-NEXT: ret i32 [[TMP22]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT: ret i32 [[TMP10]]
 ;
 ; REMARK-LABEL: Function: gather_multiple_use
 ; REMARK:       Args:
-; REMARK-NEXT:    - String: Vectorizing horizontal reduction is possible
-; REMARK-NEXT:    - String: 'but not beneficial with cost '
-; REMARK-NEXT:    - Cost: '2'
+; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
+; REMARK-NEXT:    - Cost: '-7'
 ;
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
   %tmp00 = lshr i32 %a, 15
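The REMARK change records the tree cost flipping from +2 (not beneficial)
to -7 (vectorized). One consistent reading of those two numbers is sketched
below; GatherCost and RestOfTree are invented values, chosen only so that
both remarks come out right, not taken from the AArch64 cost model:

    // Illustrative arithmetic only: hypothetical per-part costs that are
    // consistent with the REMARK lines in the test above.
    #include <cassert>

    int main() {
      const int GatherCost = 9;   // hypothetical cost of the shared build vector
      const int RestOfTree = -16; // hypothetical net cost of everything else
      // Before the patch, the duplicated gather entry was costed once per use:
      assert(RestOfTree + 2 * GatherCost == 2);  // "not beneficial with cost 2"
      // After the patch, only its definition is costed:
      assert(RestOfTree + 1 * GatherCost == -7); // "reduction with cost -7"
      return 0;
    }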
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 00ac87d6a997..54dbc13934e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -141,15 +141,16 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK:      bb1:
 ; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
 ; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
 ; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
-; CHECK-NEXT: ret i8 [[TMP6]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   br label %bb1
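The commit message relies on CSE to clean up the duplicate gather sequences
that are still emitted during code generation. A hedged sketch of that idea
follows; emitGather and Bundle are invented names for illustration, and this
is not LLVM's actual CSE:

    // Sketch of the CSE idea: reuse an already-emitted gather for an
    // identical scalar bundle instead of emitting a second copy.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    using Bundle = std::vector<std::string>; // the scalars a gather packs

    std::string emitGather(const Bundle &Scalars,
                           std::map<Bundle, std::string> &Emitted) {
      auto It = Emitted.find(Scalars);
      if (It != Emitted.end())
        return It->second; // identical sequence already exists: reuse it
      std::string V = "%vec" + std::to_string(Emitted.size());
      // ...real codegen would emit the insertelement chain for V here...
      Emitted.emplace(Scalars, V);
      return V;
    }

    int main() {
      std::map<Bundle, std::string> Emitted;
      // Two uses of the same build vector collapse to a single definition.
      std::cout << emitGather({"%a", "%b", "%c", "%d"}, Emitted) << "\n"; // %vec0
      std::cout << emitGather({"%a", "%b", "%c", "%d"}, Emitted) << "\n"; // %vec0
      return 0;
    }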