forked from OSchip/llvm-project
AMDGPU: Do not combine loads/stores across physreg defs
Summary: Since this pass operates on machine SSA form, this should only really affect M0 in practice. Fixes various piglit variable-indexing/vs-varying-array-mat4-index-* tests. Change-Id: Ib2a1dc3a8d7b08225a8da49a86f533faa0986aa8 Fixes: r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4") Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40343 llvm-svn: 325677
This commit is contained in:
parent
d6e1a9404d
commit
770397f4cd
|
@ -228,6 +228,16 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Returns true if any operand yielded by MI.defs() is a register operand whose
// register satisfies TargetRegisterInfo::isPhysicalRegister(); returns false
// otherwise (including when MI.defs() is empty).
//
// NOTE(review): only the operands exposed by MI.defs() are inspected. If that
// range covers explicit defs only, implicitly-defined physical registers are
// not detected here -- confirm against MachineInstr::defs().
static bool
|
||||
hasPhysRegDef(MachineInstr &MI) {
|
||||
for (const MachineOperand &Def : MI.defs()) {
|
||||
// Non-register operands are skipped before the register is queried.
if (Def.isReg() &&
|
||||
TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
|
||||
// XXX - Would the same offset be OK? Is there any reason this would happen or
|
||||
// be useful?
|
||||
|
@ -350,6 +360,13 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (hasPhysRegDef(*MBBI)) {
|
||||
// We could re-order this instruction in theory, but it would require
|
||||
// tracking physreg defs and uses. This should only affect M0 in
|
||||
// practice.
|
||||
return false;
|
||||
}
|
||||
|
||||
if (MBBI->mayLoadOrStore() &&
|
||||
(!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
|
||||
!canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
|
||||
|
@ -437,7 +454,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
|
|||
// down past this instruction.
|
||||
// check if we can move I across MBBI and if we can move all I's users
|
||||
if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
|
||||
!canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
|
||||
!canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
|
||||
hasPhysRegDef(*MBBI))
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
|
|
|
@ -613,6 +613,24 @@ bb:
|
|||
ret void
|
||||
}
|
||||
|
||||
; Two i32 LDS loads from adjacent elements (%arrayidx0 and %arrayidx0 + 1) with
; a call to @void_func_void between them. The checks below expect the output
; to contain a ds_read_b32, then s_swappc_b64, then a second ds_read_b32, in
; that order.
; GCN-LABEL: ds_read_call_read:
|
||||
; GCN: ds_read_b32
|
||||
; GCN: s_swappc_b64
|
||||
; GCN: ds_read_b32
|
||||
define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
|
||||
%x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
|
||||
%arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1
|
||||
; First load, issued before the call.
%v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4
|
||||
call void @void_func_void()
|
||||
; Second load, issued after the call.
%v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4
|
||||
; Both loaded values are summed and stored so neither load is dead.
%r = add i32 %v0, %v1
|
||||
store i32 %r, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @void_func_void() #3
|
||||
|
||||
declare i32 @llvm.amdgcn.workgroup.id.x() #1
|
||||
declare i32 @llvm.amdgcn.workgroup.id.y() #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
@ -623,3 +641,4 @@ declare void @llvm.amdgcn.s.barrier() #2
|
|||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone speculatable }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind noinline }
|
||||
|
|
|
@ -160,21 +160,25 @@ bb:
|
|||
|
||||
; SI won't merge ds memory operations, because of the signed offset bug, so
|
||||
; we only have check lines for VI.
|
||||
; VI-LABEL: v_interp_readnone:
|
||||
; VI: s_mov_b32 m0, 0
|
||||
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
|
||||
; VI: s_mov_b32 m0, -1{{$}}
|
||||
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
|
||||
define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
store float 0.000000e+00, float addrspace(3)* %lds
|
||||
%tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
|
||||
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
|
||||
store float 0.000000e+00, float addrspace(3)* %tmp2
|
||||
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
|
||||
ret void
|
||||
}
|
||||
;
|
||||
; TODO: VI won't merge them either, because we are conservative about moving
|
||||
; instructions past changes to physregs.
|
||||
;
|
||||
; TODO-VI-LABEL: v_interp_readnone:
|
||||
; TODO-VI: s_mov_b32 m0, 0
|
||||
; TODO-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; TODO-VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
|
||||
; TODO-VI: s_mov_b32 m0, -1{{$}}
|
||||
; TODO-VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
|
||||
;define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
|
||||
;bb:
|
||||
; store float 0.000000e+00, float addrspace(3)* %lds
|
||||
; %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
|
||||
; %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
|
||||
; store float 0.000000e+00, float addrspace(3)* %tmp2
|
||||
; call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
|
||||
; ret void
|
||||
;}
|
||||
|
||||
; Test that v_interp_p1 uses different source and destination registers
|
||||
; on 16 bank LDS chips.
|
||||
|
|
|
@ -232,6 +232,48 @@ main_body:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
|
||||
;
|
||||
; In principle we could merge the loads here as well, but it would require
|
||||
; careful tracking of physical registers since both v_interp* and v_movrel*
|
||||
; instructions (or gpr idx mode) use M0.
|
||||
;
|
||||
; GCN: s_buffer_load_dword
|
||||
; GCN: s_buffer_load_dword
|
||||
define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
|
||||
main_body:
|
||||
; First constant load (offset 0). Its value, bitcast to i32, is the dynamic
; index used by both extractelement instructions below.
%idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
|
||||
%idx1 = bitcast float %idx1.f to i32
|
||||
|
||||
; Interpolate three channels (interp.p1 followed by interp.p2 for each), pack
; them into a <3 x float>, and pick one element with the loaded index.
%v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
|
||||
%v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim)
|
||||
%v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim)
|
||||
%v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim)
|
||||
%v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim)
|
||||
%v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim)
|
||||
%v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
|
||||
%v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
|
||||
%v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
|
||||
%a = extractelement <3 x float> %v0, i32 %idx1
|
||||
|
||||
%v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim)
|
||||
%v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim)
|
||||
%v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim)
|
||||
%v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim)
|
||||
%v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 %prim)
|
||||
%v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim)
|
||||
; NOTE(review): the three insertelement instructions below take their inputs
; from %v0.x, %v0.tmp0, %v0.y, %v0.tmp1 and %v0.z, so %v1 is built purely from
; the %v0.* values; %v1.x/%v1.y/%v1.z, %v1.tmp0 and %v1.tmp1 have no users in
; this function. Confirm whether this is intentional or a copy-paste slip.
%v1.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
|
||||
%v1.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
|
||||
%v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
|
||||
|
||||
%b = extractelement <3 x float> %v1, i32 %idx1
|
||||
; Second constant load (offset 4), placed after the interp and extractelement
; code in the IR.
%c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
|
||||
|
||||
%res.tmp = fadd float %a, %b
|
||||
%res = fadd float %res.tmp, %c
|
||||
ret float %res
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}smrd_vgpr_merged:
|
||||
; GCN-NEXT: %bb.
|
||||
; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
|
||||
|
@ -289,8 +331,11 @@ ret_block: ; preds = %.outer, %.label22, %
|
|||
|
||||
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
|
||||
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
|
||||
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
|
||||
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind readnone speculatable }
|
||||
|
||||
!0 = !{}
|
||||
|
|
Loading…
Reference in New Issue