[AMDGPU] Convert insert_vector_elt into set of selects
This lets us avoid scratch use or indirect VGPR addressing for small vectors.

Differential Revision: https://reviews.llvm.org/D54606

llvm-svn: 347231
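As an illustration only (this sketch is not part of the patch, and the function and value names are invented), an insert with a non-constant index such as

  %v = insertelement <4 x float> %vec, float %val, i32 %idx

is now expanded, for small enough vectors, into one compare-and-select per lane, roughly equivalent to the IR below; each select then maps to a v_cmp/v_cndmask_b32 pair instead of an M0-based v_movreld or a scratch round trip:

; Conceptual, hand-written equivalent of the new DAG combine for <4 x float>.
define <4 x float> @inselt_as_selects(<4 x float> %vec, float %val, i32 %idx) {
  %e0 = extractelement <4 x float> %vec, i32 0
  %e1 = extractelement <4 x float> %vec, i32 1
  %e2 = extractelement <4 x float> %vec, i32 2
  %e3 = extractelement <4 x float> %vec, i32 3
  %c0 = icmp eq i32 %idx, 0
  %c1 = icmp eq i32 %idx, 1
  %c2 = icmp eq i32 %idx, 2
  %c3 = icmp eq i32 %idx, 3
  %s0 = select i1 %c0, float %val, float %e0
  %s1 = select i1 %c1, float %val, float %e1
  %s2 = select i1 %c2, float %val, float %e2
  %s3 = select i1 %c3, float %val, float %e3
  %r0 = insertelement <4 x float> undef, float %s0, i32 0
  %r1 = insertelement <4 x float> %r0, float %s1, i32 1
  %r2 = insertelement <4 x float> %r1, float %s2, i32 2
  %r3 = insertelement <4 x float> %r2, float %s3, i32 3
  ret <4 x float> %r3
}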
commit 054f8101f1 (parent fe034625df)
@@ -679,6 +679,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
@@ -8114,6 +8115,43 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
  return SDValue();
}

SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates the non-constant index and the subsequent movrel or
  // scratch access.
  // Sub-dword vectors of two dwords or less already have a better lowering.
  // Vectors larger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
  if (isa<ConstantSDNode>(Idx) ||
      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}

unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
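The guard above intentionally leaves two classes of vectors on their existing lowering. The two kernels below are illustrative only (the function names are invented here; the equivalent cases are exercised by float16_inselt and half2_inselt in the new insert_vector_dynelt test added further down):

; A 512-bit vector is above the 256-bit limit: expanding it would cost too
; many v_cndmask_b32 instructions, so the movrel/gpr-idx path is kept.
define amdgpu_kernel void @v16f32_keeps_movrel(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
  store <16 x float> %v, <16 x float> addrspace(1)* %out
  ret void
}

; A sub-dword vector of 64 bits or less already has a cheaper bitfield
; (v_bfi_b32) based lowering, so it is skipped as well.
define amdgpu_kernel void @v2f16_keeps_bfi(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
  store <2 x half> %v, <2 x half> addrspace(1)* %out
  ret void
}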
@@ -8722,6 +8760,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -152,6 +152,7 @@ private:
  SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  unsigned getFusedOpcode(const SelectionDAG &DAG,
                          const SDNode *N0, const SDNode *N1) const;
@@ -7,14 +7,12 @@

; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62

; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:3]], s[[S_ELT0]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]

; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_waitcnt vmcnt(0)

@@ -43,10 +41,10 @@
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63

; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc

@@ -55,7 +53,7 @@
; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:

; GCN: buffer_store_dword [[INS0]]
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%id.ext = zext i32 %id to i64

@@ -63,9 +61,9 @@ entry:
%idx0 = load volatile i32, i32 addrspace(1)* %gep
%idx1 = add i32 %idx0, 1
%live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
%vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
%vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
%vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
%vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
%cmp = icmp eq i32 %id, 0
br i1 %cmp, label %bb1, label %bb2
@@ -9,11 +9,14 @@
; CHECK: s_load_dword [[IN:s[0-9]+]]
; CHECK: s_mov_b32 m0, [[IN]]
; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
; CHECK: buffer_store_dwordx4
; CHECK: buffer_store_dwordx4
; CHECK: buffer_store_dwordx4
; CHECK: buffer_store_dwordx4
define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
%ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
store <4 x float> %ins, <4 x float> addrspace(1)* %out
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
store <16 x float> %ins, <16 x float> addrspace(1)* %out
ret void
}
@@ -10,13 +10,11 @@

; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62

; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]

; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:

@@ -46,10 +44,10 @@
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_mov_b32 m0, [[READLANE]]
; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63

; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
; IDXMODE: s_set_gpr_idx_off

; GCN-NEXT: s_xor_b64 exec, exec, vcc

@@ -58,7 +56,7 @@
; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:

; GCN: buffer_store_dword [[INS0]]
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%id.ext = zext i32 %id to i64

@@ -66,9 +64,9 @@ entry:
%idx0 = load volatile i32, i32 addrspace(1)* %gep
%idx1 = add i32 %idx0, 1
%live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
%vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
%vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
%vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
%vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
%cmp = icmp eq i32 %id, 0
br i1 %cmp, label %bb1, label %bb2
@@ -182,15 +182,16 @@ entry:
; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000

; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
%0 = add i32 %in, 1
%1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
store <4 x float> %1, <4 x float> addrspace(1)* %out
%add = add i32 %in, 1
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
store <16 x float> %ins, <16 x float> addrspace(1)* %out
ret void
}

@@ -205,27 +206,27 @@ entry:
; IDXMODE-NEXT: s_set_gpr_idx_off

; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
entry:
%0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
store <4 x float> %0, <4 x float> addrspace(1)* %out
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
store <16 x float> %ins, <16 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
; The offset depends on the register that holds the first element of the vector.
; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
; MOVREL: v_movreld_b32_e32 v0, 5
; MOVREL: v_movreld_b32_e32 v0, 16

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
entry:
%index = add i32 %offset, -512
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
%value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
store <16 x i32> %value, <16 x i32> addrspace(1)* %out
ret void
}

@@ -241,11 +242,11 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
entry:
%index = add i32 %offset, -512
%value = insertelement <4 x i32> %vec, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
%value = insertelement <16 x i32> %vec, i32 5, i32 %index
store <16 x i32> %value, <16 x i32> addrspace(1)* %out
ret void
}

@@ -256,6 +257,18 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}

; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:

@@ -263,23 +276,23 @@ entry:
; GCN: s_and_saveexec_b64 vcc, vcc

; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33

; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[LOOPBB]]
; GCN: s_mov_b64 exec, [[SAVEEXEC]]

; GCN: buffer_store_dword
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
%value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
%value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
store <16 x i32> %value, <16 x i32> addrspace(1)* %out
ret void
}

@@ -289,6 +302,18 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}

; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec

@@ -305,12 +330,12 @@ entry:
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -16
%value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
%value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
store <16 x i32> %value, <16 x i32> addrspace(1)* %out
ret void
}

@@ -428,10 +453,10 @@ bb7: ; preds = %bb4, %bb1
; GCN: s_load_dword [[ARG:s[0-9]+]]

; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: s_waitcnt
; MOVREL: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
; MOVREL: s_mov_b32 m0, -1

@@ -453,13 +478,13 @@ bb7: ; preds = %bb4, %bb1
define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
bb:
%tmp1 = add i32 %arg, -16
%tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
%tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
%tmp3 = add i32 %arg, -16
%tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float -4.0, i32 %tmp3
%tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
%tmp6 = extractelement <6 x i32> %tmp5, i32 1
%tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
%tmp8 = extractelement <6 x i32> %tmp7, i32 5
%tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
%tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
%tmp6 = extractelement <9 x i32> %tmp5, i32 1
%tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
%tmp8 = extractelement <9 x i32> %tmp7, i32 5
store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
ret void

@@ -531,7 +556,7 @@ entry:
ret void
}

; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]

@@ -542,11 +567,11 @@ entry:
; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
%idx.shl = shl i32 %idx.in, 2
%idx = or i32 %idx.shl, 1
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
ret void
}

@@ -581,9 +606,9 @@ bb2: ; preds = %bb4, %bb

bb4: ; preds = %bb2
%vgpr = load volatile i32, i32 addrspace(1)* undef
%tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
%tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
%tmp7 = extractelement <8 x i32> %tmp6, i32 0
%tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
%tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
%tmp7 = extractelement <16 x i32> %tmp6, i32 0
br label %bb2

bb8: ; preds = %bb2
@@ -0,0 +1,312 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s

; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
entry:
%v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
store <4 x float> %v, <4 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}float4_inselt_undef:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
entry:
%v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
store <4 x float> %v, <4 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}int4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
entry:
%v = insertelement <4 x i32> %vec, i32 1, i32 %sel
store <4 x i32> %v, <4 x i32> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
entry:
%v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
store <2 x float> %v, <2 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
entry:
%v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
store <8 x float> %v, <8 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}float16_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
entry:
%v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
store <16 x float> %v, <16 x float> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}half4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
%v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
store <4 x half> %v, <4 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}half2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
%v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
store <2 x half> %v, <2 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
entry:
%v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
store <8 x half> %v, <8 x half> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}short2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <2 x i16> %vec, i16 1, i32 %sel
store <2 x i16> %v, <2 x i16> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}short4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <4 x i16> %vec, i16 1, i32 %sel
store <4 x i16> %v, <4 x i16> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}byte8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
store <8 x i8> %v, <8 x i8> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
entry:
%v = insertelement <16 x i8> %vec, i8 1, i32 %sel
store <16 x i8> %v, <16 x i8> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}double2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
entry:
%v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
store <2 x double> %v, <2 x double> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
entry:
%v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
store <8 x double> %v, <8 x double> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}bit4_inselt:
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
entry:
%v = insertelement <4 x i1> %vec, i1 1, i32 %sel
store <4 x i1> %v, <4 x i1> addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
entry:
%v = insertelement <128 x i1> %vec, i1 1, i32 %sel
store <128 x i1> %v, <128 x i1> addrspace(1)* %out
ret void
}
@@ -83,8 +83,11 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
}

; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b

@@ -93,9 +96,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx2 v
; GCN-DAG: buffer_store_dword v
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b

@@ -104,8 +112,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
}

; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b

@@ -114,7 +129,11 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
}

; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {

@@ -136,8 +155,11 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1
}

; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
; GCN: v_movreld_b32
; GCN: buffer_store_dwordx2
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i32> %a, i32 5, i32 %b
store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8

@@ -145,8 +167,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx2 v
; GCN-DAG: buffer_store_dword v
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i32> %a, i32 5, i32 %b

@@ -156,8 +183,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %

; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b

@@ -166,7 +200,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
}

; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
; GCN: v_movreld_b32
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {

@@ -288,24 +325,13 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; GCN: s_load_dwordx4
; GCN: s_load_dword s

; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN-NOT: buffer_store_byte

; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]

; GCN: buffer_store_byte
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i8> %a, i8 5, i32 %b

@@ -343,23 +369,18 @@ endif:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}

; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000

; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0

; Increment to next element folded into base register, but FileCheck
; can't do math expressions

; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0

; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm

@@ -371,8 +392,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)

; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:

; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5
; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm

@@ -383,34 +408,41 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
}

; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i64> %a, i64 5, i32 %b
store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
ret void
}

; FIXME: Should be able to do without stack access. The used stack
; space is also 2x what should be required.

; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:

; Stack store
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}

; Write element
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}

; Stack reload
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}

; Store result
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 64
; GCN: ScratchSize: 0

define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
%vecins = insertelement <4 x double> %a, double 8.0, i32 %b
@@ -7,8 +7,8 @@
; GCN: ; return
define amdgpu_ps float @main(i32 inreg %arg) #0 {
main_body:
%tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg
%tmp25 = extractelement <2 x float> %tmp24, i32 1
%tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg
%tmp25 = extractelement <16 x float> %tmp24, i32 1
ret float %tmp25
}
@@ -1,11 +1,11 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s

; GCN-LABEL: {{^}}float4_alloca_store4:
; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4

; GFX-NOT: buffer_
; GCN-NOT: buffer_
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,

@@ -36,11 +36,15 @@ entry:
; GCN-LABEL: {{^}}float4_alloca_load4:
; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4

; GFX-NOT: buffer_
; GCN: v_readfirstlane_b32
; GFX8: v_movreld_b32
; GFX9: s_set_gpr_idx_on
; GFX9: s_set_gpr_idx_off
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: store_dwordx4 v[{{[0-9:]+}}],

; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca

@@ -68,7 +72,7 @@ entry:
; GCN-LABEL: {{^}}half4_alloca_store4:
; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4

; GFX-NOT: buffer_
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]

@@ -98,7 +102,7 @@ entry:
; GCN-LABEL: {{^}}half4_alloca_load4:
; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4

; GFX-NOT: buffer_
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff

@@ -128,7 +132,7 @@ entry:
; GCN-LABEL: {{^}}short4_alloca_store4:
; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4

; GFX-NOT: buffer_
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]

@@ -158,7 +162,7 @@ entry:
; GCN-LABEL: {{^}}short4_alloca_load4:
; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4

; GFX-NOT: buffer_
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
@@ -27,7 +27,10 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %o

; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
; GCN: buffer_load_dwordx4
; GCN: v_movreld_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32