forked from OSchip/llvm-project
AMDGPU: Other sizes of popcnt are fast
We can chain bcnt instructions together, so any width popcnt is pretty fast. llvm-svn: 269950
This commit is contained in:
parent
8eb336c14e
commit
1735da460b
|
@ -76,7 +76,7 @@ public:
|
||||||
|
|
||||||
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
|
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
|
||||||
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
|
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
|
||||||
return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software;
|
return TTI::PSK_FastHardware;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned getNumberOfRegisters(bool Vector);
|
unsigned getNumberOfRegisters(bool Vector);
|
||||||
|
|
|
@ -7,6 +7,9 @@ declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
|
||||||
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
|
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
|
||||||
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
|
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
|
||||||
|
|
||||||
|
declare i65 @llvm.ctpop.i65(i65) nounwind readnone
|
||||||
|
declare i128 @llvm.ctpop.i128(i128) nounwind readnone
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_ctpop_i64:
|
; FUNC-LABEL: {{^}}s_ctpop_i64:
|
||||||
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||||
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||||
|
@ -141,3 +144,52 @@ endif:
|
||||||
store i64 %tmp5, i64 addrspace(1)* %out
|
store i64 %tmp5, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; FUNC-LABEL: {{^}}s_ctpop_i128:
|
||||||
|
; GCN: s_bcnt1_i32_b64 [[SRESULT0:s[0-9]+]],
|
||||||
|
; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
|
||||||
|
; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT0]], [[SRESULT1]]
|
||||||
|
; GCN: s_endpgm
|
||||||
|
define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
|
||||||
|
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
|
||||||
|
%truncctpop = trunc i128 %ctpop to i32
|
||||||
|
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; FUNC-LABEL: {{^}}s_ctpop_i65:
|
||||||
|
; GCN: s_bcnt1_i32_b64
|
||||||
|
; GCN: s_and_b32
|
||||||
|
; GCN: s_bcnt1_i32_b64
|
||||||
|
; GCN: s_add_i32
|
||||||
|
; GCN: s_endpgm
|
||||||
|
define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
|
||||||
|
%ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
|
||||||
|
%truncctpop = trunc i65 %ctpop to i32
|
||||||
|
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; FIXME: Should not have extra add
|
||||||
|
|
||||||
|
; FUNC-LABEL: {{^}}v_ctpop_i128:
|
||||||
|
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||||
|
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
|
||||||
|
|
||||||
|
; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v[[VAL2]], 0
|
||||||
|
; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
|
||||||
|
|
||||||
|
; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
|
||||||
|
; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v[[VAL1]], [[MIDRESULT2]]
|
||||||
|
|
||||||
|
; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]]
|
||||||
|
|
||||||
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
|
; GCN: s_endpgm
|
||||||
|
define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
|
||||||
|
%val = load i128, i128 addrspace(1)* %in, align 8
|
||||||
|
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
|
||||||
|
%truncctpop = trunc i128 %ctpop to i32
|
||||||
|
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
; RUN: opt -loop-idiom -mtriple=r600-- -mcpu=SI -S < %s | FileCheck %s
|
; RUN: opt -loop-idiom -mtriple=amdgcn-- -S < %s | FileCheck %s
|
||||||
|
|
||||||
; Mostly copied from x86 version.
|
; Mostly copied from x86 version.
|
||||||
|
|
||||||
|
@ -59,6 +59,29 @@ while.end: ; preds = %while.body, %entry
|
||||||
ret i32 %c.0.lcssa
|
ret i32 %c.0.lcssa
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; CHECK-LABEL: @popcount_i128
|
||||||
|
; CHECK: entry
|
||||||
|
; CHECK: llvm.ctpop.i128
|
||||||
|
; CHECK: ret
|
||||||
|
define i32 @popcount_i128(i128 %a) nounwind uwtable readnone ssp {
|
||||||
|
entry:
|
||||||
|
%tobool3 = icmp eq i128 %a, 0
|
||||||
|
br i1 %tobool3, label %while.end, label %while.body
|
||||||
|
|
||||||
|
while.body: ; preds = %entry, %while.body
|
||||||
|
%c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
|
||||||
|
%a.addr.04 = phi i128 [ %and, %while.body ], [ %a, %entry ]
|
||||||
|
%inc = add nsw i32 %c.05, 1
|
||||||
|
%sub = add i128 %a.addr.04, -1
|
||||||
|
%and = and i128 %sub, %a.addr.04
|
||||||
|
%tobool = icmp eq i128 %and, 0
|
||||||
|
br i1 %tobool, label %while.end, label %while.body
|
||||||
|
|
||||||
|
while.end: ; preds = %while.body, %entry
|
||||||
|
%c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
|
||||||
|
ret i32 %c.0.lcssa
|
||||||
|
}
|
||||||
|
|
||||||
; To recognize this pattern:
|
; To recognize this pattern:
|
||||||
;int popcount(unsigned long long a, int mydata1, int mydata2) {
|
;int popcount(unsigned long long a, int mydata1, int mydata2) {
|
||||||
; int c = 0;
|
; int c = 0;
|
||||||
|
|
Loading…
Reference in New Issue