From e301e071ba1aad61e324c5db4129f1deb8f9c273 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Thu, 21 Jul 2022 13:14:06 +0100
Subject: [PATCH] [AMDGPU] Remove IR SpeculativeExecution pass from codegen
 pipeline

This pass seems to have very little effect because all it does is
hoist some instructions, but it is followed later in the codegen
pipeline by the IR CodeSinking pass, which does the opposite.
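
As a minimal sketch (a hypothetical example, not one of the tests
touched by this patch), SpeculativeExecution would typically hoist the
cheap, side-effect-free mul below from %if.then into %entry, and the
CodeSinking pass, run later in the pipeline, tends to move such an
instruction back down toward its use, which is why the hoist buys
little:

  define i32 @hoist_example(i1 %cond, i32 %a, i32 %b) {
  entry:
    br i1 %cond, label %if.then, label %if.end

  if.then:
    ; cheap and side-effect-free, so a speculation candidate
    %mul = mul i32 %a, %b
    br label %if.end

  if.end:
    %r = phi i32 [ %mul, %if.then ], [ 0, %entry ]
    ret i32 %r
  }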

Differential Revision: https://reviews.llvm.org/D130258
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 -
 .../divergent-branch-uniform-condition.ll     |  2 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  3 ---
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         | 24 ++++++++++++-------
 llvm/test/CodeGen/AMDGPU/select-opt.ll        |  2 --
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dca926867300..1faf910c91f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -985,7 +985,6 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
   addPass(createLICMPass());
   addPass(createSeparateConstOffsetFromGEPPass());
-  addPass(createSpeculativeExecutionPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
   addPass(createStraightLineStrengthReducePass());
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index b21a5ae79e02..7ad41bb137f0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -30,7 +30,6 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT: .LBB0_1: ; %Flow1
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
 ; ISA-NEXT: s_or_b64 exec, exec, s[6:7]
-; ISA-NEXT: s_add_i32 s8, s8, 1
 ; ISA-NEXT: s_mov_b64 s[6:7], 0
 ; ISA-NEXT: .LBB0_2: ; %Flow
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
@@ -54,6 +53,7 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT: s_cbranch_execz .LBB0_1
 ; ISA-NEXT: ; %bb.5: ; %endif2
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT: s_add_i32 s8, s8, 1
 ; ISA-NEXT: s_xor_b64 s[4:5], exec, -1
 ; ISA-NEXT: s_branch .LBB0_1
 ; ISA-NEXT: .LBB0_6: ; %Flow2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d67775cc2224..45cd54296e5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -451,7 +451,6 @@
 ; GCN-O1-OPTS-NEXT: Loop Pass Manager
 ; GCN-O1-OPTS-NEXT: Loop Invariant Code Motion
 ; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O1-OPTS-NEXT: Speculatively execute instructions
 ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
 ; GCN-O1-OPTS-NEXT: Straight line strength reduction
 ; GCN-O1-OPTS-NEXT: Early CSE
@@ -741,7 +740,6 @@
 ; GCN-O2-NEXT: Loop Pass Manager
 ; GCN-O2-NEXT: Loop Invariant Code Motion
 ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O2-NEXT: Speculatively execute instructions
 ; GCN-O2-NEXT: Scalar Evolution Analysis
 ; GCN-O2-NEXT: Straight line strength reduction
 ; GCN-O2-NEXT: Early CSE
@@ -1034,7 +1032,6 @@
 ; GCN-O3-NEXT: Loop Pass Manager
 ; GCN-O3-NEXT: Loop Invariant Code Motion
 ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O3-NEXT: Speculatively execute instructions
 ; GCN-O3-NEXT: Scalar Evolution Analysis
 ; GCN-O3-NEXT: Straight line strength reduction
 ; GCN-O3-NEXT: Phi Values Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index ff75782d4ae7..f8bf75655658 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -767,7 +767,7 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG: ; %bb.0: ; %bb
 ; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: JUMP @5 POP:1
-; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
 ; EG-NEXT: POP @5 POP:1
 ; EG-NEXT: CF_END
@@ -775,8 +775,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
 ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
 ; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: MOV T0.X, KC0[3].Y,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: LSHL T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT: ASHR T1.W, PS, literal.x,
 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
@@ -784,14 +786,16 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG-NEXT: MOV T2.W, KC0[2].Y,
 ; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
-; EG-NEXT: MOV * T0.Y, PS,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.W, KC0[3].X,
+; EG-NEXT: MOV * T0.W, KC0[3].Z,
 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: simplify_i24_crash:
 ; CM: ; %bb.0: ; %bb
 ; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
 ; CM-NEXT: JUMP @5 POP:1
-; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 17, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
 ; CM-NEXT: POP @5 POP:1
 ; CM-NEXT: CF_END
@@ -799,8 +803,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
 ; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
 ; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: MOV * T0.X, KC0[3].Y,
+; CM-NEXT: MOV * T1.X, KC0[2].W,
+; CM-NEXT: LSHL T0.Z, PV.X, literal.x,
+; CM-NEXT: LSHL * T0.W, T0.X, literal.x,
 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
 ; CM-NEXT: MOV T0.Y, KC0[2].Y,
 ; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
@@ -811,7 +817,9 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
 ; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
 ; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
-; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, KC0[3].X,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
 bb:
   %cmp = icmp eq i32 %arg0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 0a68d9f1173c..07ccf84c70fe 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -143,8 +143,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
 ; GCN-LABEL: {{^}}regression:
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
-; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0
-; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry: