From e301e071ba1aad61e324c5db4129f1deb8f9c273 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Thu, 21 Jul 2022 13:14:06 +0100
Subject: [PATCH] [AMDGPU] Remove IR SpeculativeExecution pass from codegen
 pipeline

This pass seems to have very little effect because all it does is
hoist some instructions, but it is followed later in the codegen
pipeline by the IR CodeSinking pass, which does the opposite.
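
As a minimal sketch (a hypothetical example, not one of the tests
touched by this patch), SpeculativeExecution would typically hoist the
cheap, side-effect-free mul below from %if.then into %entry, and the
CodeSinking pass, run later in the pipeline, tends to move such an
instruction back down toward its use, which is why the hoist buys
little:

  define i32 @hoist_example(i1 %cond, i32 %a, i32 %b) {
  entry:
    br i1 %cond, label %if.then, label %if.end

  if.then:
    ; cheap and side-effect-free, so a speculation candidate
    %mul = mul i32 %a, %b
    br label %if.end

  if.end:
    %r = phi i32 [ %mul, %if.then ], [ 0, %entry ]
    ret i32 %r
  }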

Differential Revision: https://reviews.llvm.org/D130258
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  1 -
 .../divergent-branch-uniform-condition.ll     |  2 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  3 ---
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         | 24 ++++++++++++-------
 llvm/test/CodeGen/AMDGPU/select-opt.ll        |  2 --
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dca926867300..1faf910c91f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -985,7 +985,6 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
   addPass(createLICMPass());
   addPass(createSeparateConstOffsetFromGEPPass());
-  addPass(createSpeculativeExecutionPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
   addPass(createStraightLineStrengthReducePass());
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index b21a5ae79e02..7ad41bb137f0 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -30,7 +30,6 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT: .LBB0_1: ; %Flow1
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
 ; ISA-NEXT: s_or_b64 exec, exec, s[6:7]
-; ISA-NEXT: s_add_i32 s8, s8, 1
 ; ISA-NEXT: s_mov_b64 s[6:7], 0
 ; ISA-NEXT: .LBB0_2: ; %Flow
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
@@ -54,6 +53,7 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT: s_cbranch_execz .LBB0_1
 ; ISA-NEXT: ; %bb.5: ; %endif2
 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT: s_add_i32 s8, s8, 1
 ; ISA-NEXT: s_xor_b64 s[4:5], exec, -1
 ; ISA-NEXT: s_branch .LBB0_1
 ; ISA-NEXT: .LBB0_6: ; %Flow2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d67775cc2224..45cd54296e5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -451,7 +451,6 @@
 ; GCN-O1-OPTS-NEXT: Loop Pass Manager
 ; GCN-O1-OPTS-NEXT: Loop Invariant Code Motion
 ; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O1-OPTS-NEXT: Speculatively execute instructions
 ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis
 ; GCN-O1-OPTS-NEXT: Straight line strength reduction
 ; GCN-O1-OPTS-NEXT: Early CSE
@@ -741,7 +740,6 @@
 ; GCN-O2-NEXT: Loop Pass Manager
 ; GCN-O2-NEXT: Loop Invariant Code Motion
 ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O2-NEXT: Speculatively execute instructions
 ; GCN-O2-NEXT: Scalar Evolution Analysis
 ; GCN-O2-NEXT: Straight line strength reduction
 ; GCN-O2-NEXT: Early CSE
@@ -1034,7 +1032,6 @@
 ; GCN-O3-NEXT: Loop Pass Manager
 ; GCN-O3-NEXT: Loop Invariant Code Motion
 ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
-; GCN-O3-NEXT: Speculatively execute instructions
 ; GCN-O3-NEXT: Scalar Evolution Analysis
 ; GCN-O3-NEXT: Straight line strength reduction
 ; GCN-O3-NEXT: Phi Values Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index ff75782d4ae7..f8bf75655658 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -767,7 +767,7 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG: ; %bb.0: ; %bb
 ; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: JUMP @5 POP:1
-; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 14, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
 ; EG-NEXT: POP @5 POP:1
 ; EG-NEXT: CF_END
@@ -775,8 +775,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
 ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
 ; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
-; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: MOV T0.X, KC0[3].Y,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: LSHL T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
 ; EG-NEXT: ASHR T1.W, PS, literal.x,
 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
@@ -784,14 +786,16 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; EG-NEXT: MOV T2.W, KC0[2].Y,
 ; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
-; EG-NEXT: MOV * T0.Y, PS,
+; EG-NEXT: MOV T0.Y, PS,
+; EG-NEXT: MOV T0.W, KC0[3].X,
+; EG-NEXT: MOV * T0.W, KC0[3].Z,
 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
 ;
 ; CM-LABEL: simplify_i24_crash:
 ; CM: ; %bb.0: ; %bb
 ; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
 ; CM-NEXT: JUMP @5 POP:1
-; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 17, @8, KC0[CB0:0-32], KC1[]
 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
 ; CM-NEXT: POP @5 POP:1
 ; CM-NEXT: CF_END
@@ -799,8 +803,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
 ; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
 ; CM-NEXT: ALU clause starting at 8:
-; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
-; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: MOV * T0.X, KC0[3].Y,
+; CM-NEXT: MOV * T1.X, KC0[2].W,
+; CM-NEXT: LSHL T0.Z, PV.X, literal.x,
+; CM-NEXT: LSHL * T0.W, T0.X, literal.x,
 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
 ; CM-NEXT: MOV T0.Y, KC0[2].Y,
 ; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
@@ -811,7 +817,9 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
 ; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
 ; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
 ; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
-; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: MOV T0.Y, PV.X,
+; CM-NEXT: MOV T0.Z, KC0[3].X,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
 bb:
   %cmp = icmp eq i32 %arg0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 0a68d9f1173c..07ccf84c70fe 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -143,8 +143,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
 ; GCN-LABEL: {{^}}regression:
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
-; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0
-; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry: