[SLP] Also try to vectorize incoming values of PHIs .

Currently we do not consider incoming values of PHIs as roots for SLP vectorization. This means we miss scenarios like the one in the test case and PR47670. It appears quite straight-forward to consider incoming values of PHIs as roots for vectorization, but I might be missing something that makes this problematic. In terms of vectorized instructions, this applies to quite a few benchmarks across MultiSource/SPEC2000/SPEC2006 on X86 with -O3 -flto Same hash: 185 (filtered out) Remaining: 52 Metric: SLP.NumVectorInstructions Program base patch diff test-suite...ProxyApps-C++/HPCCG/HPCCG.test 9.00 27.00 200.0% test-suite...C/CFP2000/179.art/179.art.test 8.00 22.00 175.0% test-suite...T2006/458.sjeng/458.sjeng.test 14.00 30.00 114.3% test-suite...ce/Benchmarks/PAQ8p/paq8p.test 11.00 18.00 63.6% test-suite...s/FreeBench/neural/neural.test 12.00 18.00 50.0% test-suite...rimaran/enc-3des/enc-3des.test 65.00 95.00 46.2% test-suite...006/450.soplex/450.soplex.test 63.00 89.00 41.3% test-suite...ProxyApps-C++/CLAMR/CLAMR.test 177.00 250.00 41.2% test-suite...nchmarks/McCat/18-imp/imp.test 13.00 18.00 38.5% test-suite.../Applications/sgefa/sgefa.test 26.00 35.00 34.6% test-suite...pplications/oggenc/oggenc.test 100.00 133.00 33.0% test-suite...6/482.sphinx3/482.sphinx3.test 103.00 134.00 30.1% test-suite...oxyApps-C++/miniFE/miniFE.test 169.00 213.00 26.0% test-suite.../Benchmarks/Olden/tsp/tsp.test 59.00 73.00 23.7% test-suite...TimberWolfMC/timberwolfmc.test 503.00 622.00 23.7% test-suite...T2006/456.hmmer/456.hmmer.test 65.00 79.00 21.5% test-suite...libquantum/462.libquantum.test 58.00 68.00 17.2% test-suite...ternal/HMMER/hmmcalibrate.test 84.00 98.00 16.7% test-suite...ications/JM/ldecod/ldecod.test 351.00 401.00 14.2% test-suite...arks/VersaBench/dbms/dbms.test 52.00 57.00 9.6% test-suite...ce/Benchmarks/Olden/bh/bh.test 118.00 128.00 8.5% test-suite.../Benchmarks/Bullet/bullet.test 6355.00 6880.00 8.3% test-suite...nsumer-lame/consumer-lame.test 480.00 519.00 8.1% test-suite...000/183.equake/183.equake.test 226.00 244.00 8.0% test-suite...chmarks/Olden/power/power.test 105.00 113.00 7.6% test-suite...6/471.omnetpp/471.omnetpp.test 92.00 99.00 7.6% test-suite...ications/JM/lencod/lencod.test 1173.00 1261.00 7.5% test-suite...0/253.perlbmk/253.perlbmk.test 55.00 59.00 7.3% test-suite...oxyApps-C/miniAMR/miniAMR.test 92.00 98.00 6.5% test-suite...chmarks/MallocBench/gs/gs.test 446.00 473.00 6.1% test-suite.../CINT2006/403.gcc/403.gcc.test 464.00 491.00 5.8% test-suite...6/464.h264ref/464.h264ref.test 998.00 1055.00 5.7% test-suite...006/453.povray/453.povray.test 5711.00 6007.00 5.2% test-suite...FreeBench/distray/distray.test 102.00 107.00 4.9% test-suite...:: External/Povray/povray.test 4184.00 4378.00 4.6% test-suite...DOE-ProxyApps-C/CoMD/CoMD.test 112.00 117.00 4.5% test-suite...T2006/445.gobmk/445.gobmk.test 104.00 108.00 3.8% test-suite...CI_Purple/SMG2000/smg2000.test 789.00 819.00 3.8% test-suite...yApps-C++/PENNANT/PENNANT.test 233.00 241.00 3.4% test-suite...marks/7zip/7zip-benchmark.test 417.00 428.00 2.6% test-suite...arks/mafft/pairlocalalign.test 627.00 643.00 2.6% test-suite.../Benchmarks/nbench/nbench.test 259.00 265.00 2.3% test-suite...006/447.dealII/447.dealII.test 4641.00 4732.00 2.0% test-suite...lications/ClamAV/clamscan.test 106.00 108.00 1.9% test-suite...CFP2000/177.mesa/177.mesa.test 1639.00 1664.00 1.5% test-suite...oxyApps-C/RSBench/rsbench.test 66.00 65.00 -1.5% test-suite.../CINT2000/252.eon/252.eon.test 3416.00 3444.00 0.8% test-suite...CFP2000/188.ammp/188.ammp.test 1846.00 1861.00 0.8% test-suite.../CINT2000/176.gcc/176.gcc.test 152.00 153.00 0.7% test-suite...CFP2006/444.namd/444.namd.test 3528.00 3544.00 0.5% test-suite...T2006/473.astar/473.astar.test 98.00 98.00 0.0% test-suite...frame_layout/frame_layout.test NaN 39.00 nan% On ARM64, there appears to be a slight regression on SPEC2006, which might be interesting to investigate: test-suite...T2006/473.astar/473.astar.test 0.9% Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D88735
2020-11-06 09:39:55 +00:00 · 2020-11-06 09:39:55 +00:00 · d8d1cc647d
parent 179d91b376
commit d8d1cc647d
2 changed files with 82 additions and 27 deletions
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -7626,16 +7626,27 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
-      if (P->getNumIncomingValues() != 2)
-        return Changed;
+      if (P->getNumIncomingValues() == 2) {
+        // Try to match and vectorize a horizontal reduction.
+        if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+                                     TTI)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+      // Try to vectorize the incoming values of the PHI, to catch reductions
+      // that feed into PHIs.
+      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
+        // Skip if the incoming block is the current BB for now.
+        // TODO: Collect the skipped incoming values and try to vectorize them
+        // after processing BB.
+        if (BB == P->getIncomingBlock(I))
+          continue;

-      // Try to match and vectorize a horizontal reduction.
-      if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
-                                   TTI)) {
-        Changed = true;
-        it = BB->begin();
-        e = BB->end();
-        continue;
+        Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+                                            P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@ -1632,38 +1632,82 @@ define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroe
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1]]
 ;
 ; STORE-LABEL: @reduction_result_used_in_phi(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; STORE:       bb:
-; STORE-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; STORE-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; STORE-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; STORE-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; STORE-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; STORE-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; STORE-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; STORE-NEXT:    br label [[EXIT]]
 ; STORE:       exit:
-; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; STORE-NEXT:    ret i32 [[SUM_1]]
+;
+entry:
+  br i1 %b, label %bb, label %exit
+
+bb:
+  %l.0 = load i32, i32* %data, align 4
+  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
+  %l.1 = load i32, i32* %idx.1, align 4
+  %add.1 = add i32 %l.1, %l.0
+  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
+  %l.2 = load i32, i32* %idx.2, align 4
+  %add.2 = add i32 %l.2, %add.1
+  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
+  %l.3 = load i32, i32* %idx.3, align 4
+  %add.3 = add i32 %l.3, %add.2
+  br label %exit
+
+exit:
+  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
+  ret i32 %sum.1
+}
+
+define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
+; CHECK-LABEL: @reduction_result_used_in_phi_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; CHECK-NEXT:    ret i32 [[SUM_1]]
+;
+; STORE-LABEL: @reduction_result_used_in_phi_loop(
+; STORE-NEXT:  entry:
+; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; STORE:       bb:
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; STORE-NEXT:    br label [[EXIT]]
+; STORE:       exit:
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; STORE-NEXT:    ret i32 [[SUM_1]]
 ;
 entry: