forked from OSchip/llvm-project
[AMDGPU] Prefer SplitVectorLoad/Store over expandUnalignedLoad/Store
ExpandUnalignedLoad/Store can sometimes produce unnecessary copies to a temporary stack slot. We should prefer splitting vectors if possible. Differential Revision: https://reviews.llvm.org/D88882
This commit is contained in:
parent
380087e6c9
commit
7c88d13fd1
|
@ -8014,13 +8014,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
|
||||
"Custom lowering for non-i32 vectors hasn't been implemented.");
|
||||
|
||||
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
|
||||
MemVT, *Load->getMemOperand())) {
|
||||
SDValue Ops[2];
|
||||
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
unsigned Alignment = Load->getAlignment();
|
||||
unsigned AS = Load->getAddressSpace();
|
||||
if (Subtarget->hasLDSMisalignedBug() &&
|
||||
|
@ -8132,6 +8125,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|||
return SplitVectorLoad(Op, DAG);
|
||||
}
|
||||
}
|
||||
|
||||
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
|
||||
MemVT, *Load->getMemOperand())) {
|
||||
SDValue Ops[2];
|
||||
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
|
||||
return DAG.getMergeValues(Ops, DL);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
@ -8537,11 +8538,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
|||
assert(VT.isVector() &&
|
||||
Store->getValue().getValueType().getScalarType() == MVT::i32);
|
||||
|
||||
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
|
||||
VT, *Store->getMemOperand())) {
|
||||
return expandUnalignedStore(Store, DAG);
|
||||
}
|
||||
|
||||
unsigned AS = Store->getAddressSpace();
|
||||
if (Subtarget->hasLDSMisalignedBug() &&
|
||||
AS == AMDGPUAS::FLAT_ADDRESS &&
|
||||
|
@ -8566,6 +8562,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
|||
// v3 stores not supported on SI.
|
||||
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
|
||||
return SplitVectorStore(Op, DAG);
|
||||
|
||||
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
|
||||
VT, *Store->getMemOperand()))
|
||||
return expandUnalignedStore(Store, DAG);
|
||||
|
||||
return SDValue();
|
||||
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
|
||||
switch (Subtarget->getMaxPrivateElementSize()) {
|
||||
|
@ -8605,6 +8606,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
|||
return SplitVectorStore(Op, DAG);
|
||||
}
|
||||
|
||||
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
|
||||
VT, *Store->getMemOperand())) {
|
||||
if (VT.isVector())
|
||||
return SplitVectorStore(Op, DAG);
|
||||
return expandUnalignedStore(Store, DAG);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
} else {
|
||||
llvm_unreachable("unhandled address space");
|
||||
|
|
|
@ -7,35 +7,15 @@
|
|||
define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) {
|
||||
; CHECK-LABEL: test:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_mov_b32 s8, s4
|
||||
; CHECK-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
|
||||
; CHECK-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
|
||||
; CHECK-NEXT: s_mov_b32 s6, -1
|
||||
; CHECK-NEXT: s_mov_b32 s7, 0xe8f000
|
||||
; CHECK-NEXT: s_add_u32 s4, s4, s8
|
||||
; CHECK-NEXT: s_addc_u32 s5, s5, 0
|
||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 12, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
||||
; CHECK-NEXT: s_mov_b32 m0, -1
|
||||
; CHECK-NEXT: ds_read_b32 v1, v1
|
||||
; CHECK-NEXT: ds_read_b32 v2, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 4, v0
|
||||
; CHECK-NEXT: ds_read_b32 v2, v1
|
||||
; CHECK-NEXT: ds_read_b32 v1, v4
|
||||
; CHECK-NEXT: ds_read_b32 v3, v3
|
||||
; CHECK-NEXT: ds_read_b32 v0, v0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; CHECK-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28
|
||||
; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; CHECK-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16
|
||||
; CHECK-NEXT: s_waitcnt expcnt(1)
|
||||
; CHECK-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28
|
||||
; CHECK-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24
|
||||
; CHECK-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20
|
||||
; CHECK-NEXT: s_waitcnt expcnt(0)
|
||||
; CHECK-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: exp mrt0 off, off, off, off
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
||||
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
|
||||
|
@ -50,42 +30,25 @@ define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %ar
|
|||
define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) {
|
||||
; CHECK-LABEL: test_2:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; CHECK-NEXT: s_mov_b32 s10, -1
|
||||
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; CHECK-NEXT: s_add_u32 s8, s8, s5
|
||||
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
||||
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 28, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 12, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1
|
||||
; CHECK-NEXT: s_mov_b32 m0, -1
|
||||
; CHECK-NEXT: ds_read_b32 v4, v2
|
||||
; CHECK-NEXT: ds_read_b32 v5, v3
|
||||
; CHECK-NEXT: ds_read_b32 v3, v3
|
||||
; CHECK-NEXT: ds_read_b32 v2, v6
|
||||
; CHECK-NEXT: ds_read_b32 v3, v7
|
||||
; CHECK-NEXT: ds_read_b32 v9, v7
|
||||
; CHECK-NEXT: ds_read_b32 v8, v8
|
||||
; CHECK-NEXT: ds_read_b32 v9, v9
|
||||
; CHECK-NEXT: ds_read_b32 v7, v10
|
||||
; CHECK-NEXT: ds_read_b32 v6, v1
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(6)
|
||||
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:28
|
||||
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:24
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
|
||||
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; CHECK-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:44
|
||||
; CHECK-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:40
|
||||
; CHECK-NEXT: ds_read_b32 v5, v5
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; CHECK-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:36
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:32
|
||||
; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
|
||||
; CHECK-NEXT: s_endpgm
|
||||
%load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4
|
||||
|
@ -99,65 +62,42 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3,
|
|||
define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) {
|
||||
; CHECK-LABEL: test_3:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; CHECK-NEXT: s_mov_b32 s10, -1
|
||||
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; CHECK-NEXT: s_add_u32 s8, s8, s6
|
||||
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
||||
; CHECK-NEXT: s_mov_b32 s7, s5
|
||||
; CHECK-NEXT: s_mov_b32 s6, s4
|
||||
; CHECK-NEXT: s_mov_b32 s5, s3
|
||||
; CHECK-NEXT: s_mov_b32 s4, s2
|
||||
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 8, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
|
||||
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v9, s0
|
||||
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 8, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 4, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 16, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v14, vcc, 20, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
|
||||
; CHECK-NEXT: s_mov_b32 m0, -1
|
||||
; CHECK-NEXT: ds_read_b32 v5, v0
|
||||
; CHECK-NEXT: ds_read_b32 v6, v3
|
||||
; CHECK-NEXT: ds_read_b32 v4, v4
|
||||
; CHECK-NEXT: ds_read_b32 v8, v8
|
||||
; CHECK-NEXT: ds_read_b32 v7, v7
|
||||
; CHECK-NEXT: ds_read_b32 v3, v1
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
|
||||
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44
|
||||
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32
|
||||
; CHECK-NEXT: ds_read_b32 v5, v4
|
||||
; CHECK-NEXT: ds_read_b32 v4, v7
|
||||
; CHECK-NEXT: ds_read_b32 v1, v8
|
||||
; CHECK-NEXT: ds_read_b32 v6, v6
|
||||
; CHECK-NEXT: ds_read_b32 v0, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
|
||||
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
|
||||
; CHECK-NEXT: ds_read_b32 v0, v10
|
||||
; CHECK-NEXT: ds_read_b32 v1, v11
|
||||
; CHECK-NEXT: s_waitcnt expcnt(1)
|
||||
; CHECK-NEXT: ds_read_b32 v3, v12
|
||||
; CHECK-NEXT: ds_read_b32 v4, v13
|
||||
; CHECK-NEXT: ds_read_b32 v2, v2
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
|
||||
; CHECK-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28
|
||||
; CHECK-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
|
||||
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
|
||||
; CHECK-NEXT: s_waitcnt expcnt(1)
|
||||
; CHECK-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28
|
||||
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
|
||||
; CHECK-NEXT: s_waitcnt expcnt(0)
|
||||
; CHECK-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24
|
||||
; CHECK-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20
|
||||
; CHECK-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16
|
||||
; CHECK-NEXT: ds_read_b32 v5, v14
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: ds_read_b32 v0, v2
|
||||
; CHECK-NEXT: ds_read_b32 v2, v12
|
||||
; CHECK-NEXT: ds_read_b32 v1, v7
|
||||
; CHECK-NEXT: ds_read_b32 v5, v8
|
||||
; CHECK-NEXT: ds_read_b32 v3, v11
|
||||
; CHECK-NEXT: ds_read_b32 v4, v10
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
|
||||
; CHECK-NEXT: exp mrt0 off, off, off, off
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
|
||||
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
|
||||
|
|
Loading…
Reference in New Issue