From e9608a84d881e3190d17d8b11f9fdf6d745018ef Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 6 Apr 2021 14:31:24 +0100
Subject: [PATCH] [AMDGPU][SDag] Add IMG init also for image_gather4
 instructions

This fixes an oversight in D99747, which moved the IMG init code from
SIAddIMGInit to AdjustInstrPostInstrSelection but left hasPostISelHook
explicitly overridden to 0 on gather4 instructions. Remove that override
so the post-isel hook also applies to image_gather4.

Differential Revision: https://reviews.llvm.org/D99953
---
 llvm/lib/Target/AMDGPU/MIMGInstructions.td    |  2 +-
 .../llvm.amdgcn.image.gather4.dim.ll          | 63 +++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.image.gather4.dim.ll   | 16 +++++
 3 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 7a7f397abc61..e438394c18e4 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -851,7 +851,7 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
-      Gather4 = 1, hasPostISelHook = 0 in {
+      Gather4 = 1 in {
     let VDataDwords = 2 in
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 6cd67de8de78..5e82ab8c6ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -49,6 +49,68 @@ main_body:
   ret <4 x float> %v
 }
 
+define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+; GFX6-LABEL: gather4_2d_tfe:
+; GFX6:       ; %bb.0: ; %main_body
+; GFX6-NEXT:    s_mov_b32 s0, s2
+; GFX6-NEXT:    s_mov_b32 s1, s3
+; GFX6-NEXT:    s_mov_b32 s2, s4
+; GFX6-NEXT:    s_mov_b32 s3, s5
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s7
+; GFX6-NEXT:    s_mov_b32 s6, s8
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    s_mov_b32 s8, s10
+; GFX6-NEXT:    s_mov_b32 s9, s11
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
+; GFX6-NEXT:    s_mov_b32 s10, s12
+; GFX6-NEXT:    s_mov_b32 s11, s13
+; GFX6-NEXT:    s_wqm_b64 exec, exec
+; GFX6-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, v0
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
+; GFX6-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX10NSA-LABEL: gather4_2d_tfe:
+; GFX10NSA:       ; %bb.0: ; %main_body
+; GFX10NSA-NEXT:    s_mov_b32 s28, exec_lo
+; GFX10NSA-NEXT:    s_mov_b32 s0, s2
+; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10NSA-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10NSA-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10NSA-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10NSA-NEXT:    s_mov_b32 s1, s3
+; GFX10NSA-NEXT:    s_mov_b32 s2, s4
+; GFX10NSA-NEXT:    s_mov_b32 s3, s5
+; GFX10NSA-NEXT:    s_mov_b32 s4, s6
+; GFX10NSA-NEXT:    s_mov_b32 s5, s7
+; GFX10NSA-NEXT:    s_mov_b32 s6, s8
+; GFX10NSA-NEXT:    s_mov_b32 s7, s9
+; GFX10NSA-NEXT:    s_mov_b32 s8, s10
+; GFX10NSA-NEXT:    s_mov_b32 s9, s11
+; GFX10NSA-NEXT:    s_mov_b32 s10, s12
+; GFX10NSA-NEXT:    s_mov_b32 s11, s13
+; GFX10NSA-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10NSA-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10NSA-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10NSA-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10NSA-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
+; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10NSA-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = extractvalue { <4 x float>, i32 } %v, 0
+  ret <4 x float> %r
+}
+
 define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
 ; GFX6-LABEL: gather4_cube:
 ; GFX6:       ; %bb.0: ; %main_body
@@ -778,6 +840,7 @@ main_body:
 }
 
 declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
+declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
 declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
 declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
 declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
index df33d2b082b5..bdf7e4277e52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll
@@ -11,6 +11,21 @@ main_body:
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}gather4_2d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: v_mov_b32_e32 v2, v0
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: v_mov_b32_e32 v4, v0
+; GFX6789: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe{{$}}
+; GFX10: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ;
+define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %r = extractvalue { <4 x float>, i32 } %v, 0
+  ret <4 x float> %r
+}
+
 ; GCN-LABEL: {{^}}gather4_cube:
 ; GFX6789: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da{{$}}
 ; GFX10: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ;
@@ -156,6 +171,7 @@ main_body:
 }
 
 declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1