[AMDGPU] Add merging into S_BUFFER_LOAD_DWORDX8_IMM

Extend SILoadStoreOptimizer to merge into the DWORDX8 variant of
S_BUFFER_LOAD. Merging into the DWORDX2 and DWORDX4 variants is already
handled.

Differential Revision: https://reviews.llvm.org/D108909
parent 2f0750dd2e
commit 30d6c39bca
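For illustration only (a sketch, not code from the commit): with this change, two adjacent 4-dword loads from the same SBase, such as

    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0
    %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0, 16, 0

can now be merged into a single 8-dword load plus subregister copies, roughly (the sgpr_256 result class is indicative; memory operands omitted):

    %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0
    %1:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3
    %2:sgpr_128 = COPY %3.sub4_sub5_sub6_sub7

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp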
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
     return 2;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
     return 4;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+    return 8;
   case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
   case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
   case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
@@ -372,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return S_BUFFER_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -413,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
   }
 }
@@ -463,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     Result.SBase = true;
     return Result;
   case AMDGPU::DS_READ_B32:
@@ -857,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
       return false;
     case 2:
     case 4:
+    case 8:
       return true;
     }
   }
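(A combined width of 3 stays unmergeable for this class: SMEM has no S_BUFFER_LOAD_DWORDX3 encoding, so the legal merged widths are exactly 2, 4, and now 8.)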
@@ -1523,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
     case 4:
       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+    case 8:
+      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
     }
   case MIMG:
-    assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+           "No overlaps");
     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
   }
 }
 
 std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
-  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
-    return std::make_pair(0, 0);
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+                                    const CombineInfo &Paired) {
+  assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
 
   bool ReverseOrder;
   if (CI.InstClass == MIMG) {
-    assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
-           "No overlaps");
+    assert(
+        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+        "No overlaps");
     ReverseOrder = CI.DMask > Paired.DMask;
   } else
     ReverseOrder = CI.Offset > Paired.Offset;
 
-  static const unsigned Idxs[4][4] = {
-      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
-      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
-      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
-      {AMDGPU::sub3, 0, 0, 0},
-  };
   unsigned Idx0;
   unsigned Idx1;
 
-  assert(CI.Width >= 1 && CI.Width <= 3);
-  assert(Paired.Width >= 1 && Paired.Width <= 3);
-
-  if (ReverseOrder) {
-    Idx1 = Idxs[0][Paired.Width - 1];
-    Idx0 = Idxs[Paired.Width][CI.Width - 1];
-  } else {
-    Idx0 = Idxs[0][CI.Width - 1];
-    Idx1 = Idxs[CI.Width][Paired.Width - 1];
-  }
+  if (CI.Width + Paired.Width > 4) {
+    assert(CI.Width == 4 && Paired.Width == 4);
+
+    if (ReverseOrder) {
+      Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+    } else {
+      Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+      Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+    }
+  } else {
+    static const unsigned Idxs[4][4] = {
+        {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+        {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+        {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+        {AMDGPU::sub3, 0, 0, 0},
+    };
+
+    assert(CI.Width >= 1 && CI.Width <= 3);
+    assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+    if (ReverseOrder) {
+      Idx1 = Idxs[0][Paired.Width - 1];
+      Idx0 = Idxs[Paired.Width][CI.Width - 1];
+    } else {
+      Idx0 = Idxs[0][CI.Width - 1];
+      Idx1 = Idxs[CI.Width][Paired.Width - 1];
+    }
+  }
 
   return std::make_pair(Idx0, Idx1);
 }
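A worked instance of the index selection (illustrative, not from the commit): when two 4-dword loads merge and CI is the one at the higher offset (ReverseOrder is true), CI takes Idx0 = AMDGPU::sub4_sub5_sub6_sub7 and Paired takes Idx1 = AMDGPU::sub0_sub1_sub2_sub3. Narrower merges still go through the table, e.g. a 1-dword CI below a 2-dword Paired gives Idx0 = Idxs[0][0] = AMDGPU::sub0 and Idx1 = Idxs[1][1] = AMDGPU::sub1_sub2.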
@@ -2134,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       MachineBasicBlock::iterator NewMI =
           mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
       CI.setMI(NewMI, *TII, *STM);
-      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
       break;
     }
     case BUFFER_LOAD: {
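With the widened limit, a merged pair whose combined width is still under 8 dwords is queued for another round, so narrow results can cascade: four DWORDX2 loads merge pairwise into two DWORDX4 loads and then into one DWORDX8, which is what the x8_out_of_x2 test below builds up. The commit's second file is the new MIR test (under llvm/test/CodeGen/AMDGPU/, apparently merge-s-buffer-loads.mir):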
@@ -0,0 +1,133 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# CHECK-LABEL: name: merge_s_buffer_load_x2
+# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
+name: merge_s_buffer_load_x2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x4
+# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+name: merge_s_buffer_load_x4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+    %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+    %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+    %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_reordered
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8_reordered
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+    %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+    %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+    %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+    %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 8)
+name: merge_s_buffer_load_x8_out_of_x2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
+    %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+    %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+    %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_out_of_x4
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+    %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+    S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_mixed
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+    %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+    %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+    %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+
+    S_ENDPGM 0
+...
+---