2019-10-16 00:41:15 +08:00
|
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
|
|
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
|
|
|
|
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
|
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic
Summary:
Now that we've made all the necessary backend changes, we can add a new
intrinsic which exposes the new capabilities to IR producers. Since
llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we
should deprecate the former. We also add tests for all the functionality
that was added in previous changes, now that we can access it via an IR
construct.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34718
llvm-svn: 310399
2017-08-09 02:52:22 +08:00
|
|
|
|
2019-06-13 02:02:41 +08:00
|
|
|
; GCN-LABEL: {{^}}dpp_test:
|
|
|
|
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
|
|
|
|
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
|
2020-10-20 05:38:02 +08:00
|
|
|
; GFX8: s_nop 1
|
2019-06-13 02:02:41 +08:00
|
|
|
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic
Summary:
Now that we've made all the necessary backend changes, we can add a new
intrinsic which exposes the new capabilities to IR producers. Since
llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we
should deprecate the former. We also add tests for all the functionality
that was added in previous changes, now that we can access it via an IR
construct.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34718
llvm-svn: 310399
2017-08-09 02:52:22 +08:00
|
|
|
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
|
2019-06-13 02:02:41 +08:00
|
|
|
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
|
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic
Summary:
Now that we've made all the necessary backend changes, we can add a new
intrinsic which exposes the new capabilities to IR producers. Since
llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we
should deprecate the former. We also add tests for all the functionality
that was added in previous changes, now that we can access it via an IR
construct.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34718
llvm-svn: 310399
2017-08-09 02:52:22 +08:00
|
|
|
store i32 %tmp0, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-06-13 02:02:41 +08:00
|
|
|
; GCN-LABEL: {{^}}dpp_test_bc:
|
|
|
|
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
|
|
|
|
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
|
2020-10-20 05:38:02 +08:00
|
|
|
; GFX8: s_nop 1
|
2019-06-13 02:02:41 +08:00
|
|
|
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0{{$}}
|
|
|
|
define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
|
|
|
|
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
|
|
|
|
store i32 %tmp0, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-10-16 00:41:15 +08:00
|
|
|
; GCN-LABEL: {{^}}dpp_test1:
|
2019-06-13 02:02:41 +08:00
|
|
|
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
2020-10-20 05:38:02 +08:00
|
|
|
; GFX8-NOOPT: s_nop 1
|
|
|
|
; GFX8-OPT: s_nop 0
|
|
|
|
; GFX8-OPT-NEXT: s_nop 0
|
2019-10-16 00:41:15 +08:00
|
|
|
; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
|
2018-07-16 18:02:41 +08:00
|
|
|
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
|
|
|
|
define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
|
|
|
|
bb:
|
|
|
|
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
%tmp1 = zext i32 %tmp to i64
|
|
|
|
%tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
|
|
|
|
%tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
|
2019-03-26 04:50:21 +08:00
|
|
|
fence syncscope("workgroup-one-as") release
|
2018-07-16 18:02:41 +08:00
|
|
|
tail call void @llvm.amdgcn.s.barrier()
|
2019-03-26 04:50:21 +08:00
|
|
|
fence syncscope("workgroup-one-as") acquire
|
2018-07-16 18:02:41 +08:00
|
|
|
%tmp4 = add nsw i32 %tmp3, %tmp3
|
|
|
|
%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
|
|
|
|
%tmp6 = add nsw i32 %tmp5, %tmp4
|
|
|
|
%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
|
|
|
|
store i32 %tmp6, i32* %tmp7, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-10-16 00:41:15 +08:00
|
|
|
; GCN-LABEL: {{^}}update_dpp64_test:
|
|
|
|
; GCN: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
|
|
|
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
|
|
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
|
|
|
|
%load = load i64, i64 addrspace(1)* %gep
|
|
|
|
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
|
|
|
|
store i64 %tmp0, i64 addrspace(1)* %gep
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
|
|
|
|
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
|
|
|
|
; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
|
|
|
|
; GCN-DAG: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
|
|
|
|
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
|
|
|
|
%load = load i64, i64 addrspace(1)* %gep
|
|
|
|
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
|
|
|
|
store i64 %tmp0, i64 addrspace(1)* %gep
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
|
|
|
|
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
|
|
|
|
; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
|
|
|
|
define amdgpu_kernel void @update_dpp64_imm_src_test(i64 addrspace(1)* %out, i64 %in1) {
|
|
|
|
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
|
|
|
|
store i64 %tmp0, i64 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2018-07-16 18:02:41 +08:00
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
declare void @llvm.amdgcn.s.barrier()
|
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic
Summary:
Now that we've made all the necessary backend changes, we can add a new
intrinsic which exposes the new capabilities to IR producers. Since
llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we
should deprecate the former. We also add tests for all the functionality
that was added in previous changes, now that we can access it via an IR
construct.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34718
llvm-svn: 310399
2017-08-09 02:52:22 +08:00
|
|
|
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
|
2019-10-16 00:41:15 +08:00
|
|
|
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
|
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic
Summary:
Now that we've made all the necessary backend changes, we can add a new
intrinsic which exposes the new capabilities to IR producers. Since
llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we
should deprecate the former. We also add tests for all the functionality
that was added in previous changes, now that we can access it via an IR
construct.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34718
llvm-svn: 310399
2017-08-09 02:52:22 +08:00
|
|
|
|
|
|
|
attributes #0 = { nounwind readnone convergent }
|