forked from OSchip/llvm-project
[AMDGPU][SDag] Add IMG init also for image_gather4 instructions
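This fixes an oversight in D99747, which moved the IMG init code from SIAddIMGInit to AdjustInstrPostInstrSelection but left the hasPostISelHook flag cleared on gather4 instructions, so the hook never ran for them. The fix drops the `hasPostISelHook = 0` override from the MIMG_Gather multiclass and adds gather4 TFE tests. Differential Revision: https://reviews.llvm.org/D99953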
This commit is contained in:
parent
7344f3d39a
commit
e9608a84d8
|
@ -851,7 +851,7 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
|
|||
}
|
||||
|
||||
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
|
||||
Gather4 = 1, hasPostISelHook = 0 in {
|
||||
Gather4 = 1 in {
|
||||
let VDataDwords = 2 in
|
||||
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
|
||||
let VDataDwords = 4 in
|
||||
|
|
|
@ -49,6 +49,68 @@ main_body:
|
|||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 2D gather4 through the struct-returning ({ <4 x float>, i32 }) form of the
; intrinsic. The checks below expect the five result VGPRs v[0:4] to be
; zero-initialized (0 moved into v0, then v0 copied into v1-v4) before the
; image_gather4 that carries the tfe modifier. Only member 0 of the returned
; struct (the <4 x float>) is used; the i32 member is dropped.
; NOTE(review): the `i32 1` after `i1 false` in the call is presumably the
; texfailctrl operand that requests TFE - confirm against the intrinsic docs.
define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
|
||||
; GFX6-LABEL: gather4_2d_tfe:
|
||||
; GFX6: ; %bb.0: ; %main_body
|
||||
; GFX6-NEXT: s_mov_b32 s0, s2
|
||||
; GFX6-NEXT: s_mov_b32 s1, s3
|
||||
; GFX6-NEXT: s_mov_b32 s2, s4
|
||||
; GFX6-NEXT: s_mov_b32 s3, s5
|
||||
; GFX6-NEXT: s_mov_b32 s4, s6
|
||||
; GFX6-NEXT: s_mov_b32 s5, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, s8
|
||||
; GFX6-NEXT: s_mov_b32 s7, s9
|
||||
; GFX6-NEXT: s_mov_b32 s8, s10
|
||||
; GFX6-NEXT: s_mov_b32 s9, s11
|
||||
; GFX6-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX6-NEXT: s_mov_b32 s10, s12
|
||||
; GFX6-NEXT: s_mov_b32 s11, s13
|
||||
; GFX6-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX6-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX6-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10NSA-LABEL: gather4_2d_tfe:
|
||||
; GFX10NSA: ; %bb.0: ; %main_body
|
||||
; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10NSA-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10NSA-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10NSA-NEXT: s_mov_b32 s6, s8
|
||||
; GFX10NSA-NEXT: s_mov_b32 s7, s9
|
||||
; GFX10NSA-NEXT: s_mov_b32 s8, s10
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX10NSA-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
|
||||
; GFX10NSA-NEXT: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10NSA-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
|
||||
%r = extractvalue { <4 x float>, i32 } %v, 0
|
||||
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
|
||||
; GFX6-LABEL: gather4_cube:
|
||||
; GFX6: ; %bb.0: ; %main_body
|
||||
|
@ -778,6 +840,7 @@ main_body:
|
|||
}
|
||||
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
|
||||
declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
|
||||
|
|
|
@ -11,6 +11,21 @@ main_body:
|
|||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gather4_2d_tfe:
|
||||
; GCN: v_mov_b32_e32 v0, 0
|
||||
; GCN: v_mov_b32_e32 v1, v0
|
||||
; GCN: v_mov_b32_e32 v2, v0
|
||||
; GCN: v_mov_b32_e32 v3, v0
|
||||
; GCN: v_mov_b32_e32 v4, v0
|
||||
; GFX6789: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe{{$}}
|
||||
; GFX10: image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ;
|
||||
; 2D gather4 through the struct-returning ({ <4 x float>, i32 }) form of the
; intrinsic. The checks preceding this function expect v0 to be set to 0 and
; copied into v1-v4 before an image_gather4 writing v[0:4] with the tfe
; modifier. Only member 0 of the returned struct (the <4 x float>) is used.
; NOTE(review): the `i32 1` after `i1 0` in the call is presumably the
; texfailctrl operand that requests TFE - confirm against the intrinsic docs.
define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
|
||||
main_body:
|
||||
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
|
||||
%r = extractvalue { <4 x float>, i32 } %v, 0
|
||||
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gather4_cube:
|
||||
; GFX6789: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da{{$}}
|
||||
; GFX10: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ;
|
||||
|
@ -156,6 +171,7 @@ main_body:
|
|||
}
|
||||
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
|
||||
declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
|
||||
|
||||
|
|
Loading…
Reference in New Issue