2017-10-13 03:37:14 +08:00
|
|
|
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s
|
|
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA -check-prefix=FUNC %s
|
|
|
|
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
|
2014-06-18 01:36:27 +08:00
|
|
|
|
2017-10-13 03:37:14 +08:00
|
|
|
; Declarations of the intrinsics called by the kernels in this file.
; Each llvm.cttz.* overload takes the value to scan plus an i1 flag; per the
; LLVM Language Reference that flag states whether a zero input produces an
; undefined (poison) result.
; NOTE(review): @llvm.cttz.i7 has no caller in the part of the file reviewed
; here - presumably it is used further down; confirm before removing.
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
|
|
|
|
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
|
|
|
|
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
|
2014-06-18 01:36:27 +08:00
|
|
|
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
|
2017-10-13 03:37:14 +08:00
|
|
|
declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
|
2014-06-18 01:36:27 +08:00
|
|
|
; Vector overloads, called by the v2i32 / v4i32 kernels.
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
|
|
|
|
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
|
2020-02-10 05:38:56 +08:00
|
|
|
; Called by the v_cttz_zero_undef_{i32,v2i32,v4i32} kernels to index their
; input pointer.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
2014-06-18 01:36:27 +08:00
|
|
|
|
2014-10-02 01:15:17 +08:00
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
|
2014-11-05 22:50:53 +08:00
|
|
|
; SI: s_load_dword [[VAL:s[0-9]+]],
|
|
|
|
; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
|
|
|
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
|
|
|
; SI: buffer_store_dword [[VRESULT]],
|
|
|
|
; SI: s_endpgm
|
2014-07-15 23:51:09 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel under test: calls llvm.cttz.i32 on the kernel argument %val with the
; i1 flag set to true and stores the i32 result to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
2014-06-18 01:36:27 +08:00
|
|
|
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
|
|
|
|
store i32 %cttz, i32 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:15:17 +08:00
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
|
2017-07-05 01:32:00 +08:00
|
|
|
; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
|
2014-11-05 22:50:53 +08:00
|
|
|
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
|
|
|
|
; SI: buffer_store_dword [[RESULT]],
|
|
|
|
; SI: s_endpgm
|
2014-07-15 23:51:09 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel under test: loads one i32 from %valptr at the index returned by
; llvm.amdgcn.workitem.id.x, calls llvm.cttz.i32 on it with the i1 flag set to
; true, and stores the result to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
|
2020-02-10 05:38:56 +08:00
|
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
2017-07-05 01:32:00 +08:00
|
|
|
; Address of element %tid of the input array.
%in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
|
|
|
|
%val = load i32, i32 addrspace(1)* %in.gep, align 4
|
2014-06-18 01:36:27 +08:00
|
|
|
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
|
|
|
|
; The result is written to %out itself; %tid is not applied to the output
; pointer.
store i32 %cttz, i32 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:15:17 +08:00
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
|
2017-07-05 01:32:00 +08:00
|
|
|
; SI: {{buffer|flat}}_load_dwordx2
|
2014-11-05 22:50:53 +08:00
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: buffer_store_dwordx2
|
|
|
|
; SI: s_endpgm
|
2014-07-15 23:51:09 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel under test: loads one <2 x i32> from %valptr at the index returned by
; llvm.amdgcn.workitem.id.x, calls llvm.cttz.v2i32 on it with the i1 flag set
; to true, and stores the vector result to %out.
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
|
2020-02-10 05:38:56 +08:00
|
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
2017-07-05 01:32:00 +08:00
|
|
|
; Address of vector element %tid of the input array.
%in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
|
|
|
|
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
|
2014-06-18 01:36:27 +08:00
|
|
|
%cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
|
|
|
|
; The result is written to %out itself; %tid is not applied to the output
; pointer.
store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:15:17 +08:00
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
|
2017-07-05 01:32:00 +08:00
|
|
|
; SI: {{buffer|flat}}_load_dwordx4
|
2014-11-05 22:50:53 +08:00
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: v_ffbl_b32_e32
|
|
|
|
; SI: buffer_store_dwordx4
|
|
|
|
; SI: s_endpgm
|
2014-07-15 23:51:09 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel under test: loads one <4 x i32> from %valptr at the index returned by
; llvm.amdgcn.workitem.id.x, calls llvm.cttz.v4i32 on it with the i1 flag set
; to true, and stores the vector result to %out.
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
|
2020-02-10 05:38:56 +08:00
|
|
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
2017-07-05 01:32:00 +08:00
|
|
|
; Address of vector element %tid of the input array.
%in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
|
|
|
|
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
|
2014-06-18 01:36:27 +08:00
|
|
|
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
|
|
|
|
; The result is written to %out itself; %tid is not applied to the output
; pointer.
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
|
|
|
|
ret void
|
|
|
|
}
|
2017-10-13 03:37:14 +08:00
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select:
|
|
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: calls llvm.cttz.i8 on the kernel argument %val with the
; i1 flag set to true, builds a select that yields 32 when %val is 0, and
; stores an i8 to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
|
|
|
|
%cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i8 %val, 0
|
|
|
|
%ret = select i1 %cttz_ret, i8 %cttz, i8 32
|
|
|
|
; NOTE(review): the store writes %cttz, so %ret (and %cttz_ret) above are
; unused. The v_cttz_zero_undef_*_with_select kernels store %ret instead.
; Confirm whether this is intentional; the expectations for this kernel were
; presumably written against the IR as it stands.
store i8 %cttz, i8 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select:
|
|
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: calls llvm.cttz.i16 on the kernel argument %val with the
; i1 flag set to true, builds a select that yields 32 when %val is 0, and
; stores an i16 to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
|
|
|
|
%cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i16 %val, 0
|
|
|
|
%ret = select i1 %cttz_ret, i16 %cttz, i16 32
|
|
|
|
; NOTE(review): the store writes %cttz, so %ret (and %cttz_ret) above are
; unused. The v_cttz_zero_undef_*_with_select kernels store %ret instead.
; Confirm whether this is intentional; the expectations for this kernel were
; presumably written against the IR as it stands.
store i16 %cttz, i16 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select:
|
|
|
|
; SI: s_ff1_i32_b32
|
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
|
|
; Kernel under test: calls llvm.cttz.i32 on the kernel argument %val with the
; i1 flag set to true, builds a select that yields 32 when %val is 0, and
; stores an i32 to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
|
|
|
%cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i32 %val, 0
|
|
|
|
%ret = select i1 %cttz_ret, i32 %cttz, i32 32
|
|
|
|
; NOTE(review): the store writes %cttz, so %ret (and %cttz_ret) above are
; unused. The v_cttz_zero_undef_*_with_select kernels store %ret instead.
; Confirm whether this is intentional; the expectations for this kernel were
; presumably written against the IR as it stands.
store i32 %cttz, i32 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select:
|
|
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; Kernel under test: calls llvm.cttz.i64 on the kernel argument %val with the
; i1 flag set to true, builds a select that yields 32 when %val is 0, and
; stores an i64 to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
|
|
|
|
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i64 %val, 0
|
|
|
|
%ret = select i1 %cttz_ret, i64 %cttz, i64 32
|
|
|
|
; NOTE(review): the store writes %cttz, so %ret (and %cttz_ret) above are
; unused. The v_cttz_zero_undef_*_with_select kernels store %ret instead.
; Confirm whether this is intentional; the expectations for this kernel were
; presumably written against the IR as it stands.
; The i64 store is declared with align 4.
store i64 %cttz, i64 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
|
|
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
2020-02-08 13:33:39 +08:00
|
|
|
; SI-SDWA: v_ffbl_b32_e32
|
2017-10-13 03:37:14 +08:00
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; Kernel under test: loads an i8 from %arrayidx, calls llvm.cttz.i8 on it with
; the i1 flag set to true, selects 32 instead of the cttz result when the
; loaded value is 0, and stores the selected i8 to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
%val = load i8, i8 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i8 %val, 0
|
|
|
|
; NOTE(review): the fallback constant is 32 although the value type is i8 -
; confirm this is the intended constant for the narrow variant.
%ret = select i1 %cttz_ret, i8 %cttz, i8 32
|
|
|
|
store i8 %ret, i8 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
|
|
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
2020-02-08 13:33:39 +08:00
|
|
|
; SI-SDWA: v_ffbl_b32_e32
|
2017-10-13 03:37:14 +08:00
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; Kernel under test: loads an i16 from %arrayidx, calls llvm.cttz.i16 on it
; with the i1 flag set to true, selects 32 instead of the cttz result when the
; loaded value is 0, and stores the selected i16 to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i16 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i16, i16 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i16 %val, 0
|
|
|
|
; NOTE(review): the fallback constant is 32 although the value type is i16 -
; confirm this is the intended constant for the narrow variant.
%ret = select i1 %cttz_ret, i16 %cttz, i16 32
|
|
|
|
store i16 %ret, i16 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select:
|
[AMDGPU] Select s_cselect
Summary:
Add patterns to select s_cselect in the isel.
Handle more cases of implicit SCC accesses in si-fix-sgpr-copies
to allow new patterns to work.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, asbirlea, kerbowa, llvm-commits
Tags: #llvm
Re-commit D81925 with a bugfix D82370.
Differential Revision: https://reviews.llvm.org/D81925
Differential Revision: https://reviews.llvm.org/D82370
2020-03-04 22:13:08 +08:00
|
|
|
; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; SI-DAG: v_cmp_ne_u32_e32 vcc, 0
|
2017-10-13 03:37:14 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; Kernel under test: loads an i32 from %arrayidx, calls llvm.cttz.i32 on it
; with the i1 flag set to true, selects 32 instead of the cttz result when the
; loaded value is 0, and stores the selected i32 to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i32 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i32, i32 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i32 %val, 0
|
|
|
|
; 32 is the bit width of the i32 operand.
%ret = select i1 %cttz_ret, i32 %cttz, i32 32
|
|
|
|
store i32 %ret, i32 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select:
|
|
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
|
|
; SI-NOSDWA: v_or_b32_e32
|
2019-12-20 22:59:15 +08:00
|
|
|
; SI-NOSDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
|
|
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
|
2020-01-07 23:43:46 +08:00
|
|
|
; SI-SDWA: v_or_b32_e32
|
[AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic
Get rid of all fixmes and base heuristic on `num-clustered-dwords`. The main intuition behind this is as
follows. The existing heuristic roughly summarizes as below:
* Assume, all the mem ops instructions participating in the clustering process, loads/stores same num bytes
* If num bytes loaded by each mem op is 4 bytes, then cluster at max 5 mem ops, that is at max 20 bytes
* If num bytes loaded by each mem op is 8 bytes, then cluster at max 3 mem ops, that is at max 24 bytes
* If num bytes loaded by each mem op is 16 bytes, then cluster at max 2 mem ops, that is at max 32 bytes
So, we need to make sure that the new heuristic does not completely deviate from the above one, and that it
properly handles both the sub-word loads and the wide loads.
Reviewed By: arsenm, rampitec
Differential Revision: https://reviews.llvm.org/D84354
2020-07-31 00:09:34 +08:00
|
|
|
; SI-SDWA: v_or_b32_sdwa
|
2020-01-07 23:43:46 +08:00
|
|
|
; SI-SDWA: v_or_b32_e32
|
2017-10-13 03:37:14 +08:00
|
|
|
; SI-SDWA: v_or_b32_sdwa
|
[AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic
Get rid of all fixmes and base heuristic on `num-clustered-dwords`. The main intuition behind this is as
follows. The existing heuristic roughly summarizes as below:
* Assume, all the mem ops instructions participating in the clustering process, loads/stores same num bytes
* If num bytes loaded by each mem op is 4 bytes, then cluster at max 5 mem ops, that is at max 20 bytes
* If num bytes loaded by each mem op is 8 bytes, then cluster at max 3 mem ops, that is at max 24 bytes
* If num bytes loaded by each mem op is 16 bytes, then cluster at max 2 mem ops, that is at max 32 bytes
So, we need to make sure that the new heuristic does not completely deviate from the above one, and that it
properly handles both the sub-word loads and the wide loads.
Reviewed By: arsenm, rampitec
Differential Revision: https://reviews.llvm.org/D84354
2020-07-31 00:09:34 +08:00
|
|
|
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
2019-12-20 22:59:15 +08:00
|
|
|
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
[AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic
Get rid of all fixmes and base heuristic on `num-clustered-dwords`. The main intuition behind this is as
follows. The existing heuristic roughly summarizes as below:
* Assume, all the mem ops instructions participating in the clustering process, loads/stores same num bytes
* If num bytes loaded by each mem op is 4 bytes, then cluster at max 5 mem ops, that is at max 20 bytes
* If num bytes loaded by each mem op is 8 bytes, then cluster at max 3 mem ops, that is at max 24 bytes
* If num bytes loaded by each mem op is 16 bytes, then cluster at max 2 mem ops, that is at max 32 bytes
So, we need to make sure that the new heuristic does not completely deviate from the above one, and that it
properly handles both the sub-word loads and the wide loads.
Reviewed By: arsenm, rampitec
Differential Revision: https://reviews.llvm.org/D84354
2020-07-31 00:09:34 +08:00
|
|
|
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
|
2019-12-20 22:59:15 +08:00
|
|
|
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
|
2017-10-18 05:49:52 +08:00
|
|
|
; SI: v_cmp_eq_u32_e32 vcc, 0
|
|
|
|
; SI: v_cmp_ne_u64_e32 vcc, 0
|
2017-10-13 03:37:14 +08:00
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
|
|
; Kernel under test: loads an i64 from %arrayidx, calls llvm.cttz.i64 on it
; with the i1 flag set to true, selects 32 instead of the cttz result when the
; loaded value is 0, and stores the selected i64 to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i64 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i64, i64 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
|
|
|
|
%cttz_ret = icmp ne i64 %val, 0
|
|
|
|
; NOTE(review): the fallback constant is 32 although the value type is i64 -
; confirm this is the intended constant for the wide variant.
%ret = select i1 %cttz_ret, i64 %cttz, i64 32
|
|
|
|
store i64 %ret, i64 addrspace(1)* %out, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1:
|
2020-07-14 17:12:30 +08:00
|
|
|
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
|
|
|
|
; SI: buffer_store_dword [[VAL]],
|
2017-10-13 03:37:14 +08:00
|
|
|
; SI: s_endpgm
|
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: loads an i32 from %arrayidx, calls llvm.cttz.i32 on it
; with the i1 flag set to false, selects -1 when the loaded value equals 0 and
; the cttz result otherwise, and stores the selected value to %out.
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i32 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i32, i32 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
|
|
|
|
%cmp = icmp eq i32 %val, 0
|
|
|
|
; -1 on the true (zero-input) arm, the cttz result on the false arm.
%sel = select i1 %cmp, i32 -1, i32 %cttz
|
|
|
|
store i32 %sel, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1:
|
2020-07-14 17:12:30 +08:00
|
|
|
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
|
|
|
|
; SI: buffer_store_dword [[VAL]],
|
2017-10-13 03:37:14 +08:00
|
|
|
; SI: s_endpgm
|
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: loads an i32 from %arrayidx, calls llvm.cttz.i32 on it
; with the i1 flag set to false, selects the cttz result when the loaded value
; is not 0 and -1 otherwise, and stores the selected value to %out.
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i32 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i32, i32 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
|
|
|
|
%cmp = icmp ne i32 %val, 0
|
|
|
|
; Same selection as the eq form, written with the inverted predicate and
; swapped select arms.
%sel = select i1 %cmp, i32 %cttz, i32 -1
|
|
|
|
store i32 %sel, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth:
|
|
|
|
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; SI: v_cmp
|
|
|
|
; SI: v_cndmask
|
|
|
|
; SI: s_endpgm
|
|
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: loads an i32 from %arrayidx, calls llvm.cttz.i32 on it
; with the i1 flag set to false, selects the cttz result when that result is
; not 32 and -1 otherwise, and stores the selected value to %out.
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i32 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i32, i32 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
|
|
|
|
; The compare is on the cttz result against 32 (the i32 bit width), not on
; the loaded value against 0.
%cmp = icmp ne i32 %cttz, 32
|
|
|
|
%sel = select i1 %cmp, i32 %cttz, i32 -1
|
|
|
|
store i32 %sel, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
|
|
|
|
; SI: {{buffer|flat}}_load_ubyte
|
2018-02-07 07:54:37 +08:00
|
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
2020-02-08 13:33:39 +08:00
|
|
|
; SI-SDWA: v_ffbl_b32_e32
|
2017-10-13 03:37:14 +08:00
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: loads an i8 from %arrayidx, calls llvm.cttz.i8 on it with
; the i1 flag set to false, selects -1 when the loaded value equals 0 and the
; cttz result otherwise, and stores the selected i8 to %out.
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
%val = load i8, i8 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
|
|
|
|
%cmp = icmp eq i8 %val, 0
|
|
|
|
; -1 on the true (zero-input) arm, the cttz result on the false arm.
%sel = select i1 %cmp, i8 -1, i8 %cttz
|
|
|
|
store i8 %sel, i8 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1:
|
|
|
|
; SI: {{buffer|flat}}_load_ubyte
|
|
|
|
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
|
|
; SI: buffer_store_short
|
|
|
|
; EG: MEM_RAT MSKOR
|
|
|
|
; EG: FFBL_INT
|
|
|
|
; Kernel under test: loads an i16 from %arrayidx, calls llvm.cttz.i16 on it
; with the i1 flag set to false, selects -1 when the loaded value equals 0 and
; the cttz result otherwise, and stores the selected i16 to %out.
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
|
|
|
|
; The i16 load is declared with align 1, i.e. below the natural alignment of
; the type.
%val = load i16, i16 addrspace(1)* %arrayidx, align 1
|
|
|
|
%cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
|
|
|
|
%cmp = icmp eq i16 %val, 0
|
|
|
|
; -1 on the true (zero-input) arm, the cttz result on the false arm.
%sel = select i1 %cmp, i16 -1, i16 %cttz
|
|
|
|
store i16 %sel, i16 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|