; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s

; Test for a conv2d like sequence of loads.

; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
; Loads at offsets 1..4 and -4..-1 (x8 bytes) around %gep: the backend should
; fold the small byte offsets into the global_load immediate and keep the base
; in an SGPR pair (saddr form, checked by the GFX9-DAG lines above).
define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
entry:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = zext i32 %id to i64
  %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
  %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
  %load0 = load i64, i64 addrspace(1)* %ptr0
  %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
  %load1 = load i64, i64 addrspace(1)* %ptr1
  %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
  %load2 = load i64, i64 addrspace(1)* %ptr2
  %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
  %load3 = load i64, i64 addrspace(1)* %ptr3
  %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
  %load4 = load i64, i64 addrspace(1)* %ptr4
  %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
  %load5 = load i64, i64 addrspace(1)* %ptr5
  %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
  %load6 = load i64, i64 addrspace(1)* %ptr6
  %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
  %load7 = load i64, i64 addrspace(1)* %ptr7
  %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
  %load8 = load i64, i64 addrspace(1)* %ptr8
  %add0 = add i64 %load1, %load0
  %add1 = add i64 %load3, %load2
  %add2 = add i64 %load5, %load4
  %add3 = add i64 %load7, %load6
  %add4 = add i64 %add0, %load8
  %add5 = add i64 %add2, %add1
  %add6 = add i64 %add4, %add3
  %add7 = add i64 %add6, %add5
  %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
  %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
  store volatile i64 %add7, i64 addrspace(1)* %ptr9

; Test various offset boundaries.

; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}}
  %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
  %load11 = load i64, i64 addrspace(1)* %gep11
  %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
  %load12 = load i64, i64 addrspace(1)* %gep12
  %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
  %load13 = load i64, i64 addrspace(1)* %gep13
  %add11 = add i64 %load11, %load12
  %add12 = add i64 %add11, %load13
  store volatile i64 %add12, i64 addrspace(1)* undef

; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
  %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
  %load21 = load i64, i64 addrspace(1)* %gep21
  %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
  %load22 = load i64, i64 addrspace(1)* %gep22
  %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
  %load23 = load i64, i64 addrspace(1)* %gep23
  %add21 = add i64 %load22, %load21
  %add22 = add i64 %add21, %load23
  store volatile i64 %add22, i64 addrspace(1)* undef

; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
  %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
  %load31 = load i64, i64 addrspace(1)* %gep31
  %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
  %load32 = load i64, i64 addrspace(1)* %gep32
  %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
  %load33 = load i64, i64 addrspace(1)* %gep33
  %add34 = add i64 %load32, %load31
  %add35 = add i64 %add34, %load33
  store volatile i64 %add35, i64 addrspace(1)* undef
  ret void
}
; GFX9-LABEL: {{^}}_amdgpu_cs_main:
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NOT: global_load_dword
; A <4 x i64> volatile load/store round-trip from an inttoptr'd inreg base:
; should be lowered to two dwordx4 loads (off + offset:16) and never split
; into scalar global_load_dword ops (checked by GFX9-NOT above).
define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
bb:
  %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
  %tmp2 = load volatile <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
  store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
  ret void
}
; Intrinsic declaration and attribute groups referenced by the kernels above.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { convergent nounwind }
attributes #1 = { nounwind readnone speculatable }