2018-12-01 02:29:17 +08:00
|
|
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
2018-11-16 09:13:34 +08:00
|
|
|
|
|
|
|
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
|
|
|
|
|
|
|
|
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
|
|
|
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
|
|
|
|
|
|
|
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
[AMDGPU] Fix for vector element insertion
Summary:
Incorrect code was generated when lowering insertelement operations
for vectors with 8 or 16 bit elements. The value being inserted was
not adjusted for the position of the element within the 32 bit word
and so only the low element within each 32 bit word could receive
the intended value.
Fixed by simply replicating the value to each element of a
congruent vector before the mask-and-or operation used to
update the intended element.
A number of affected LIT tests have been updated appropriately.
Reviewers: arsenm, nhaehnle
Reviewed By: arsenm
Subscribers: llvm-commits, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D57588
llvm-svn: 352885
2019-02-02 00:51:09 +08:00
|
|
|
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e7
|
2018-11-16 09:13:34 +08:00
|
|
|
|
|
|
|
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
|
|
|
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
|
|
|
|
|
|
|
|
; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
|
|
|
; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
|
|
|
|
|
|
|
|
; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
|
|
|
|
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
|
|
|
; Kernel under test: overwrites one element of a <2 x i16> vector with the
; constant 999, where the element index is itself loaded from memory.
;   %out     - destination array of <2 x i16> (addrspace(1))
;   %in      - source array of <2 x i16> (addrspace(1))
;   %idx.ptr - array of i32 element indices (addrspace(1))
; The same per-work-item offset is applied to all three arrays.
define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
|
|
|
|
; Work-item id in x; used as the array offset for every access below.
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
|
|
|
; Sign-extend the id to i64 so it can be used as a GEP index.
%tid.ext = sext i32 %tid to i64
|
|
|
|
; Address of this work-item's input vector.
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
|
|
|
|
; Address of this work-item's element index.
%idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
|
|
|
|
; Address of this work-item's output slot.
%out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
|
|
|
|
; The index is loaded before the vector; the load checks for this function
; are written in the same order.
%idx = load i32, i32 addrspace(1)* %idx.gep
|
|
|
|
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
|
|
|
|
; Dynamic-index insert. 999 is 0x3e7, the constant the s_mov_b32 check for
; this function expects.
%vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
|
|
|
|
; Write the updated vector back as a single <2 x i16> store.
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
; Intrinsic the kernel calls to obtain its work-item id in x; it takes no
; arguments and returns an i32. Attribute group #1 applies to it.
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
|
|
|
|
|
|
|
; Attribute group #0: attached to the kernel definition.
attributes #0 = { nounwind }
|
|
|
|
; Attribute group #1: attached to the intrinsic declaration and its call site.
attributes #1 = { nounwind readnone }
|