AMDGPU: Add combine for short vector extract_vector_elts
Try to access pieces 4 bytes at a time. This helps various hasOneUse
extract_vector_elt combines, such as load width reductions. Avoids test
regressions in a future commit.

llvm-svn: 334836
commit 63bc0e3cb9
parent 02dc7e19e2
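To illustrate the commit message, here is a small standalone C++ model of why accessing sub-dword elements "4 bytes at a time" enables load width reduction. This is not part of the patch and uses no LLVM APIs; the function names and data are illustrative, and it assumes little-endian element packing (element 0 in the low bits of dword 0), as on AMDGPU. Once two adjacent 16-bit extracts both read dword 0 of the vector, only a 4-byte load is needed, followed by a shift and a truncate, which is the s_load_dword + s_lshr_b32 pattern the new tests check for.

```cpp
// Standalone sketch (illustrative only, not LLVM code) of the load shrinking
// this combine enables for two adjacent 16-bit extracts.
#include <cstdint>
#include <cstdio>
#include <cstring>

// Before the combine: conceptually the whole <8 x i16> is loaded just to
// read elements 0 and 1.
static void extractWide(const void *Ptr, uint16_t &Elt0, uint16_t &Elt1) {
  uint16_t Vec[8];
  std::memcpy(Vec, Ptr, sizeof(Vec)); // 16-byte load
  Elt0 = Vec[0];
  Elt1 = Vec[1];
}

// After the combine: both elements live in dword 0, so a single 4-byte load
// suffices (s_load_dword); element 1 is recovered with a 16-bit right shift
// (s_lshr_b32) plus truncation. Assumes little-endian element packing.
static void extractNarrow(const void *Ptr, uint16_t &Elt0, uint16_t &Elt1) {
  uint32_t Dword0;
  std::memcpy(&Dword0, Ptr, sizeof(Dword0));  // 4-byte load
  Elt0 = static_cast<uint16_t>(Dword0);       // trunc(srl(Dword0, 0))
  Elt1 = static_cast<uint16_t>(Dword0 >> 16); // trunc(srl(Dword0, 16))
}

int main() {
  uint16_t Data[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint16_t A, B, C, D;
  extractWide(Data, A, B);
  extractNarrow(Data, C, D);
  std::printf("%u %u / %u %u\n", A, B, C, D); // 10 11 / 10 11
  return 0;
}
```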
@@ -7097,8 +7097,11 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
@@ -7139,6 +7142,44 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
                                     Vec.getOperand(1), Idx));
    }
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (EltSize <= 16 &&
      EltVT.isByteSized() &&
      VecSize > 32 &&
      VecSize % 32 == 0 &&
      Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * EltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
    DCI.AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
  }

  return SDValue();
}
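The node construction above boils down to simple index arithmetic. The following standalone sketch (illustrative only, not the LLVM API; assumes little-endian element packing as on AMDGPU) mirrors the BitIndex / EltIdx / LeftoverBitIdx computation and the trunc(srl(extract)) sequence the combine emits, and shows that two adjacent sub-dword elements resolve to the same 32-bit word.

```cpp
// Illustrative model of the index arithmetic used by the combine above.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Extract the EltSize-bit element at index Idx from a vector that has been
// "bitcast" to an array of 32-bit words (little-endian element packing).
static uint32_t extractSubDwordElt(const uint32_t *Words, unsigned Idx,
                                   unsigned EltSize) {
  assert(EltSize <= 16 && 32 % EltSize == 0 && "sub-dword elements only");
  unsigned BitIndex = Idx * EltSize;       // bit offset of the element
  unsigned EltIdx = BitIndex / 32;         // which 32-bit word holds it
  unsigned LeftoverBitIdx = BitIndex % 32; // shift amount within that word
  uint32_t Word = Words[EltIdx];           // EXTRACT_VECTOR_ELT on the bitcast
  uint32_t Srl = Word >> LeftoverBitIdx;   // ISD::SRL
  return Srl & ((1u << EltSize) - 1u);     // ISD::TRUNCATE to the element type
}

int main() {
  // <8 x i16> packed into four dwords; element i holds 0x1000 + i.
  const uint32_t Words[4] = {0x10011000u, 0x10031002u, 0x10051004u, 0x10071006u};
  // Elements 2 and 3 both come from word 1: one 32-bit access, two shifts.
  std::printf("elt2 = 0x%x\n", extractSubDwordElt(Words, 2, 16)); // 0x1002
  std::printf("elt3 = 0x%x\n", extractSubDwordElt(Words, 3, 16)); // 0x1003
  return 0;
}
```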
@@ -141,6 +141,36 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)*
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrspace(4)* %ptr) #0 {
  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
  %elt0 = extractelement <16 x half> %load, i32 0
  %elt1 = extractelement <16 x half> %load, i32 1
  store volatile half %elt0, half addrspace(1)* undef, align 2
  store volatile half %elt1, half addrspace(1)* undef, align 2
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
  %elt2 = extractelement <16 x half> %load, i32 2
  %elt3 = extractelement <16 x half> %load, i32 3
  store volatile half %elt2, half addrspace(1)* undef, align 2
  store volatile half %elt3, half addrspace(1)* undef, align 2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
@@ -142,6 +142,36 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)*
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
  %elt0 = extractelement <16 x i16> %load, i32 0
  %elt1 = extractelement <16 x i16> %load, i32 1
  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
  %elt2 = extractelement <16 x i16> %load, i32 2
  %elt3 = extractelement <16 x i16> %load, i32 3
  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
@@ -199,4 +199,78 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dword s
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt2 = extractelement <8 x i8> %load, i32 2
  %elt3 = extractelement <8 x i8> %load, i32 3
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx2
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

; FIXME: ought to be able to eliminate high half of load
; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_load_dwordx4
; GCN-NOT: {{s|buffer|flat|global}}_load_
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
  %elt0 = extractelement <16 x i8> %load, i32 0
  %elt1 = extractelement <16 x i8> %load, i32 1
  %elt4 = extractelement <16 x i8> %load, i32 4
  %elt5 = extractelement <16 x i8> %load, i32 5
  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
  ret void
}

attributes #0 = { nounwind }