2017-08-08 08:47:13 +08:00
|
|
|
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
|
2014-10-11 06:01:59 +08:00
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This pass tries to fuse DS instructions with close by immediate offsets.
|
|
|
|
// This will fuse operations such as
|
|
|
|
// ds_read_b32 v0, v2 offset:16
|
|
|
|
// ds_read_b32 v1, v2 offset:32
|
|
|
|
// ==>
|
|
|
|
// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
|
|
|
|
//
|
2017-11-28 16:42:46 +08:00
|
|
|
// The same is done for certain SMEM and VMEM opcodes, e.g.:
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
// s_buffer_load_dword s4, s[0:3], 4
|
|
|
|
// s_buffer_load_dword s5, s[0:3], 8
|
|
|
|
// ==>
|
|
|
|
// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
|
|
|
|
//
|
2014-10-11 06:01:59 +08:00
|
|
|
//
|
|
|
|
// Future improvements:
|
|
|
|
//
|
|
|
|
// - This currently relies on the scheduler to place loads and stores next to
|
|
|
|
// each other, and then only merges adjacent pairs of instructions. It would
|
|
|
|
// be good to be more flexible with interleaved instructions, and possibly run
|
|
|
|
// before scheduling. It currently misses stores of constants because loading
|
|
|
|
// the constant into the data register is placed between the stores, although
|
|
|
|
// this is arguably a scheduling problem.
|
|
|
|
//
|
|
|
|
// - Live interval recomputing seems inefficient. This currently only matches
|
|
|
|
// one pair, and recomputes live intervals and moves on to the next pair. It
|
2016-03-29 23:15:44 +08:00
|
|
|
// would be better to compute a list of all merges that need to occur.
|
2014-10-11 06:01:59 +08:00
|
|
|
//
|
|
|
|
// - With a list of instructions to process, we can also merge more. If a
|
|
|
|
// cluster of loads have offsets that are too large to fit in the 8-bit
|
|
|
|
// offsets, but are close enough to fit in the 8 bits, we can add to the base
|
|
|
|
// pointer and use the new reduced offsets.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
2016-06-24 14:30:11 +08:00
|
|
|
#include "AMDGPUSubtarget.h"
|
2014-10-11 06:01:59 +08:00
|
|
|
#include "SIInstrInfo.h"
|
|
|
|
#include "SIRegisterInfo.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
|
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
2014-10-11 06:01:59 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
2014-10-11 06:01:59 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
2014-10-11 06:01:59 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/IR/DebugLoc.h"
|
|
|
|
#include "llvm/Pass.h"
|
2014-10-11 06:01:59 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2017-01-21 08:53:49 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2015-03-24 03:32:43 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2017-08-08 08:47:13 +08:00
|
|
|
#include <algorithm>
|
2017-01-21 08:53:49 +08:00
|
|
|
#include <cassert>
|
2017-08-08 08:47:13 +08:00
|
|
|
#include <cstdlib>
|
2017-01-21 08:53:49 +08:00
|
|
|
#include <iterator>
|
|
|
|
#include <utility>
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "si-load-store-opt"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
class SILoadStoreOptimizer : public MachineFunctionPass {
|
2017-11-09 09:52:30 +08:00
|
|
|
enum InstClassEnum {
|
|
|
|
DS_READ_WRITE,
|
|
|
|
S_BUFFER_LOAD_IMM,
|
|
|
|
BUFFER_LOAD_OFFEN,
|
2017-11-09 09:52:36 +08:00
|
|
|
BUFFER_LOAD_OFFSET,
|
2017-11-09 09:52:55 +08:00
|
|
|
BUFFER_STORE_OFFEN,
|
|
|
|
BUFFER_STORE_OFFSET,
|
2017-11-09 09:52:30 +08:00
|
|
|
};
|
|
|
|
|
2017-10-10 16:30:53 +08:00
|
|
|
struct CombineInfo {
|
2017-04-14 01:53:07 +08:00
|
|
|
MachineBasicBlock::iterator I;
|
|
|
|
MachineBasicBlock::iterator Paired;
|
|
|
|
unsigned EltSize;
|
|
|
|
unsigned Offset0;
|
|
|
|
unsigned Offset1;
|
|
|
|
unsigned BaseOff;
|
2017-11-09 09:52:30 +08:00
|
|
|
InstClassEnum InstClass;
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
bool GLC0;
|
|
|
|
bool GLC1;
|
2017-11-09 09:52:30 +08:00
|
|
|
bool SLC0;
|
|
|
|
bool SLC1;
|
2017-04-14 01:53:07 +08:00
|
|
|
bool UseST64;
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
bool IsX2;
|
2017-04-14 01:53:07 +08:00
|
|
|
SmallVector<MachineInstr*, 8> InstsToMove;
|
2017-08-08 08:47:13 +08:00
|
|
|
};
|
2017-04-14 01:53:07 +08:00
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
private:
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
const SISubtarget *STM = nullptr;
|
2017-01-21 08:53:49 +08:00
|
|
|
const SIInstrInfo *TII = nullptr;
|
|
|
|
const SIRegisterInfo *TRI = nullptr;
|
|
|
|
MachineRegisterInfo *MRI = nullptr;
|
|
|
|
AliasAnalysis *AA = nullptr;
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
unsigned CreatedX2;
|
2014-10-11 06:01:59 +08:00
|
|
|
|
2017-04-14 01:53:07 +08:00
|
|
|
static bool offsetsCanBeCombined(CombineInfo &CI);
|
2014-10-11 06:01:59 +08:00
|
|
|
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
bool findMatchingInst(CombineInfo &CI);
|
2017-11-29 08:55:57 +08:00
|
|
|
|
|
|
|
unsigned read2Opcode(unsigned EltSize) const;
|
|
|
|
unsigned read2ST64Opcode(unsigned EltSize) const;
|
2017-04-14 01:53:07 +08:00
|
|
|
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
|
2017-11-29 08:55:57 +08:00
|
|
|
|
|
|
|
unsigned write2Opcode(unsigned EltSize) const;
|
|
|
|
unsigned write2ST64Opcode(unsigned EltSize) const;
|
2017-04-14 01:53:07 +08:00
|
|
|
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
|
2017-11-09 09:52:36 +08:00
|
|
|
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
|
2017-11-09 09:52:55 +08:00
|
|
|
unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
|
|
|
|
bool &IsOffen) const;
|
|
|
|
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
SILoadStoreOptimizer() : MachineFunctionPass(ID) {
|
2014-10-11 06:01:59 +08:00
|
|
|
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool optimizeBlock(MachineBasicBlock &MBB);
|
|
|
|
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
|
2018-01-23 05:46:43 +08:00
|
|
|
StringRef getPassName() const override { return "SI Load Store Optimizer"; }
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
2016-08-30 03:15:22 +08:00
|
|
|
AU.addRequired<AAResultsWrapperPass>();
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2017-01-21 08:53:49 +08:00
|
|
|
} // end anonymous namespace.
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
|
2018-01-23 05:46:43 +08:00
|
|
|
"SI Load Store Optimizer", false, false)
|
2016-08-30 03:15:22 +08:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
2014-10-11 06:01:59 +08:00
|
|
|
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
|
2018-01-23 05:46:43 +08:00
|
|
|
"SI Load Store Optimizer", false, false)
|
2014-10-11 06:01:59 +08:00
|
|
|
|
|
|
|
char SILoadStoreOptimizer::ID = 0;
|
|
|
|
|
|
|
|
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
/// Allocates a new SILoadStoreOptimizer and returns it as a FunctionPass.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  FunctionPass *const NewPass = new SILoadStoreOptimizer();
  return NewPass;
}
|
|
|
|
|
2016-08-30 03:15:22 +08:00
|
|
|
/// Relocates every instruction in \p InstsToMove so that it follows \p I.
/// Each instruction is inserted before the position that originally came right
/// after \p I, so the moved instructions keep the order they have in the list.
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *const Block = I->getParent();
  const MachineBasicBlock::iterator InsertBefore = std::next(I);

  for (MachineInstr *Moved : InstsToMove) {
    // Unlink from the current position, then re-link at the insertion point.
    Moved->removeFromParent();
    Block->insert(InsertBefore, Moved);
  }
}
|
|
|
|
|
2017-08-31 09:53:09 +08:00
|
|
|
static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
|
2018-02-08 09:56:14 +08:00
|
|
|
for (const MachineOperand &Def : MI.operands()) {
|
|
|
|
if (Def.isReg() && Def.isDef())
|
|
|
|
Defs.insert(Def.getReg());
|
|
|
|
}
|
2016-08-30 03:15:22 +08:00
|
|
|
}
|
|
|
|
|
2017-01-21 08:53:49 +08:00
|
|
|
/// Returns true if the memory accesses \p A and \p B may be swapped.
///
///   read  / read             - always safe
///   anything involving a store (RAW, WAR, WAW)
///                            - only safe if the accesses are trivially
///                              disjoint
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis * AA) {
  // Neither instruction writes memory: reordering two reads is harmless.
  if (!A->mayStore() && !B->mayStore())
    return true;

  // At least one store is involved; defer to the target's disjointness check.
  return TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
|
|
|
|
|
2016-10-27 16:15:07 +08:00
|
|
|
// Add MI and its defs to the lists if MI reads one of the defs that are
|
|
|
|
// already in the list. Returns true in that case.
|
|
|
|
static bool
|
|
|
|
addToListsIfDependent(MachineInstr &MI,
|
2017-08-31 09:53:09 +08:00
|
|
|
DenseSet<unsigned> &Defs,
|
2016-10-27 16:15:07 +08:00
|
|
|
SmallVectorImpl<MachineInstr*> &Insts) {
|
2017-08-31 09:53:09 +08:00
|
|
|
for (MachineOperand &Use : MI.operands()) {
|
|
|
|
// If one of the defs is read, then there is a use of Def between I and the
|
|
|
|
// instruction that I will potentially be merged with. We will need to move
|
|
|
|
// this instruction after the merged instructions.
|
|
|
|
|
|
|
|
if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
|
2016-10-27 16:15:07 +08:00
|
|
|
Insts.push_back(&MI);
|
|
|
|
addDefsToList(MI, Defs);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-08-30 03:15:22 +08:00
|
|
|
static bool
|
|
|
|
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
|
|
|
|
ArrayRef<MachineInstr*> InstsToMove,
|
|
|
|
const SIInstrInfo *TII,
|
|
|
|
AliasAnalysis *AA) {
|
|
|
|
assert(MemOp.mayLoadOrStore());
|
|
|
|
|
|
|
|
for (MachineInstr *InstToMove : InstsToMove) {
|
|
|
|
if (!InstToMove->mayLoadOrStore())
|
|
|
|
continue;
|
2016-11-03 22:37:13 +08:00
|
|
|
if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
|
|
|
|
return false;
|
2016-08-30 03:15:22 +08:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
AMDGPU: Do not combine loads/store across physreg defs
Summary:
Since this pass operates on machine SSA form, this should only really
affect M0 in practice.
Fixes various piglit variable-indexing/vs-varying-array-mat4-index-*
Change-Id: Ib2a1dc3a8d7b08225a8da49a86f533faa0986aa8
Fixes: r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4")
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D40343
llvm-svn: 325677
2018-02-21 21:31:35 +08:00
|
|
|
static bool
|
|
|
|
hasPhysRegDef(MachineInstr &MI) {
|
|
|
|
for (const MachineOperand &Def : MI.defs()) {
|
|
|
|
if (Def.isReg() &&
|
|
|
|
TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-04-14 01:53:07 +08:00
|
|
|
/// Decides whether the two accesses described by \p CI have offsets that allow
/// them to be merged into one instruction.
///
/// On entry CI.Offset0/CI.Offset1 hold the original offsets of the two
/// instructions. For DS instructions a successful return rewrites them to the
/// values to encode in the merged instruction, sets CI.UseST64 when the
/// stride-64 form is required and sets CI.BaseOff when the base address has to
/// be advanced. For SMEM/VMEM instructions the offsets are left untouched.
///
/// \returns true if the pair can be combined.
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  const unsigned RawOff0 = CI.Offset0;
  const unsigned RawOff1 = CI.Offset1;
  const unsigned Size = CI.EltSize;

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (RawOff0 == RawOff1)
    return false;

  // Both offsets must be multiples of the element size.
  if (RawOff0 % Size != 0 || RawOff1 % Size != 0)
    return false;

  const unsigned Elt0 = RawOff0 / Size;
  const unsigned Elt1 = RawOff1 / Size;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // SMEM and VMEM instructions: the accesses have to be directly adjacent
  // (one element apart, or two for the X2 variants) with matching cache bits.
  if (CI.InstClass != DS_READ_WRITE) {
    const unsigned Distance = CI.IsX2 ? 2 : 1;
    if (Elt0 + Distance != Elt1 && Elt1 + Distance != Elt0)
      return false;
    if (CI.GLC0 != CI.GLC1)
      return false;
    return CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1;
  }

  // If both element offsets are multiples of 64 and the quotients fit in
  // 8 bits, the stride-64 variants can encode them directly.
  if (Elt0 % 64 == 0 && Elt1 % 64 == 0 &&
      isUInt<8>(Elt0 / 64) && isUInt<8>(Elt1 / 64)) {
    CI.Offset0 = Elt0 / 64;
    CI.Offset1 = Elt1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Plain element offsets that already fit in the 8-bit fields.
  if (isUInt<8>(Elt0) && isUInt<8>(Elt1)) {
    CI.Offset0 = Elt0;
    CI.Offset1 = Elt1;
    return true;
  }

  // Otherwise try to shift the base address to the smaller offset so that only
  // the distance between the two accesses has to be encoded.
  const unsigned EltDistance =
      std::abs(static_cast<int>(Elt1) - static_cast<int>(Elt0));
  CI.BaseOff = std::min(RawOff0, RawOff1);
  const unsigned BaseElt = CI.BaseOff / Size;

  if (EltDistance % 64 == 0 && isUInt<8>(EltDistance / 64)) {
    CI.Offset0 = (Elt0 - BaseElt) / 64;
    CI.Offset1 = (Elt1 - BaseElt) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(EltDistance)) {
    CI.Offset0 = Elt0 - BaseElt;
    CI.Offset1 = Elt1 - BaseElt;
    return true;
  }

  return false;
}
|
|
|
|
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot of SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
|
2017-08-31 09:53:09 +08:00
|
|
|
MachineBasicBlock *MBB = CI.I->getParent();
|
|
|
|
MachineBasicBlock::iterator E = MBB->end();
|
2017-04-14 01:53:07 +08:00
|
|
|
MachineBasicBlock::iterator MBBI = CI.I;
|
2017-08-30 11:26:18 +08:00
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
unsigned AddrOpName[3] = {0};
|
|
|
|
int AddrIdx[3];
|
|
|
|
const MachineOperand *AddrReg[3];
|
|
|
|
unsigned NumAddresses = 0;
|
|
|
|
|
|
|
|
switch (CI.InstClass) {
|
|
|
|
case DS_READ_WRITE:
|
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
|
|
|
|
break;
|
|
|
|
case S_BUFFER_LOAD_IMM:
|
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
|
|
|
|
break;
|
|
|
|
case BUFFER_LOAD_OFFEN:
|
2017-11-09 09:52:55 +08:00
|
|
|
case BUFFER_STORE_OFFEN:
|
2017-11-09 09:52:30 +08:00
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
|
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
|
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
|
|
|
|
break;
|
2017-11-09 09:52:36 +08:00
|
|
|
case BUFFER_LOAD_OFFSET:
|
2017-11-09 09:52:55 +08:00
|
|
|
case BUFFER_STORE_OFFSET:
|
2017-11-09 09:52:36 +08:00
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
|
|
|
|
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
|
|
|
|
break;
|
2017-11-09 09:52:30 +08:00
|
|
|
}
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
for (unsigned i = 0; i < NumAddresses; i++) {
|
|
|
|
AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
|
|
|
|
AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
|
2017-08-30 11:26:18 +08:00
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
// We only ever merge operations with the same base address register, so don't
|
|
|
|
// bother scanning forward if there are no other uses.
|
|
|
|
if (AddrReg[i]->isReg() &&
|
|
|
|
(TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
|
|
|
|
MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
|
|
|
|
return false;
|
|
|
|
}
|
2017-08-30 11:26:18 +08:00
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
++MBBI;
|
|
|
|
|
2017-08-31 09:53:09 +08:00
|
|
|
DenseSet<unsigned> DefsToMove;
|
2017-04-14 01:53:07 +08:00
|
|
|
addDefsToList(*CI.I, DefsToMove);
|
2016-08-30 03:15:22 +08:00
|
|
|
|
|
|
|
for ( ; MBBI != E; ++MBBI) {
|
2017-04-14 01:53:07 +08:00
|
|
|
if (MBBI->getOpcode() != CI.I->getOpcode()) {
|
2016-08-30 03:15:22 +08:00
|
|
|
// This is not a matching DS instruction, but we can keep looking as
|
|
|
|
// long as one of these conditions are met:
|
|
|
|
// 1. It is safe to move I down past MBBI.
|
|
|
|
// 2. It is safe to move MBBI down past the instruction that I will
|
|
|
|
// be merged into.
|
|
|
|
|
2017-08-30 05:25:51 +08:00
|
|
|
if (MBBI->hasUnmodeledSideEffects()) {
|
2016-08-30 03:15:22 +08:00
|
|
|
// We can't re-order this instruction with respect to other memory
|
2017-08-30 05:25:51 +08:00
|
|
|
// operations, so we fail both conditions mentioned above.
|
2017-04-14 01:53:07 +08:00
|
|
|
return false;
|
2017-08-30 05:25:51 +08:00
|
|
|
}
|
2014-10-11 06:01:59 +08:00
|
|
|
|
AMDGPU: Do not combine loads/store across physreg defs
Summary:
Since this pass operates on machine SSA form, this should only really
affect M0 in practice.
Fixes various piglit variable-indexing/vs-varying-array-mat4-index-*
Change-Id: Ib2a1dc3a8d7b08225a8da49a86f533faa0986aa8
Fixes: r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4")
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D40343
llvm-svn: 325677
2018-02-21 21:31:35 +08:00
|
|
|
if (hasPhysRegDef(*MBBI)) {
|
|
|
|
// We could re-order this instruction in theory, but it would require
|
|
|
|
// tracking physreg defs and uses. This should only affect M0 in
|
|
|
|
// practice.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-08-30 03:15:22 +08:00
|
|
|
if (MBBI->mayLoadOrStore() &&
|
AMDGPU: Consider memory dependencies with moved instructions in SILoadStoreOptimizer
Summary:
This bug seems to have gone unnoticed because critical cases with LDS
instructions are eliminated by the peephole optimizer.
However, equivalent situations arise with buffer loads and stores
as well, so this fixes regressions since r317751 ("AMDGPU: Merge
S_BUFFER_LOAD_DWORD_IMM into x2, x4").
Fixes at least:
KHR-GL45.shader_storage_buffer_object.basic-operations-case1-cs
KHR-GL45.cull_distance.functional
piglit tes-input-gl_ClipDistance.shader_test
... and probably more
Change-Id: I0e371536288eb8e6afeaa241a185266fd45d129d
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D40303
llvm-svn: 318829
2017-11-22 20:25:21 +08:00
|
|
|
(!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
|
|
|
|
!canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
|
2016-08-30 03:15:22 +08:00
|
|
|
// We fail condition #1, but we may still be able to satisfy condition
|
|
|
|
// #2. Add this instruction to the move list and then we will check
|
|
|
|
// if condition #2 holds once we have selected the matching instruction.
|
2017-04-14 01:53:07 +08:00
|
|
|
CI.InstsToMove.push_back(&*MBBI);
|
2016-08-30 03:15:22 +08:00
|
|
|
addDefsToList(*MBBI, DefsToMove);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// When we match I with another DS instruction we will be moving I down
|
|
|
|
// to the location of the matched instruction any uses of I will need to
|
|
|
|
// be moved down as well.
|
2017-04-14 01:53:07 +08:00
|
|
|
addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
|
2016-08-30 03:15:22 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't merge volatiles.
|
|
|
|
if (MBBI->hasOrderedMemoryRef())
|
2017-04-14 01:53:07 +08:00
|
|
|
return false;
|
2016-08-30 03:15:22 +08:00
|
|
|
|
2016-10-27 16:15:07 +08:00
|
|
|
// Handle a case like
|
|
|
|
// DS_WRITE_B32 addr, v, idx0
|
|
|
|
// w = DS_READ_B32 addr, idx0
|
|
|
|
// DS_WRITE_B32 addr, f(w), idx1
|
|
|
|
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
|
|
|
|
// merging of the two writes.
|
2017-04-14 01:53:07 +08:00
|
|
|
if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
|
2016-10-27 16:15:07 +08:00
|
|
|
continue;
|
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
bool Match = true;
|
|
|
|
for (unsigned i = 0; i < NumAddresses; i++) {
|
|
|
|
const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
|
|
|
|
|
|
|
|
if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
|
|
|
|
if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
|
|
|
|
AddrReg[i]->getImm() != AddrRegNext.getImm()) {
|
|
|
|
Match = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check same base pointer. Be careful of subregisters, which can occur with
|
|
|
|
// vectors of pointers.
|
|
|
|
if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
|
|
|
|
AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
|
|
|
|
Match = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2016-08-30 03:15:22 +08:00
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
if (Match) {
|
2017-04-14 01:53:07 +08:00
|
|
|
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
|
2016-08-30 03:15:22 +08:00
|
|
|
AMDGPU::OpName::offset);
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
|
|
|
|
CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
|
2017-04-14 01:53:07 +08:00
|
|
|
CI.Paired = MBBI;
|
2016-08-30 03:15:22 +08:00
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
if (CI.InstClass == DS_READ_WRITE) {
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
CI.Offset0 &= 0xffff;
|
|
|
|
CI.Offset1 &= 0xffff;
|
2017-11-09 09:52:30 +08:00
|
|
|
} else {
|
|
|
|
CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
|
|
|
|
CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
|
2017-11-09 09:52:36 +08:00
|
|
|
if (CI.InstClass != S_BUFFER_LOAD_IMM) {
|
2017-11-09 09:52:30 +08:00
|
|
|
CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
|
|
|
|
CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
|
|
|
|
}
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
}
|
|
|
|
|
2016-08-30 03:15:22 +08:00
|
|
|
// Check both offsets fit in the reduced range.
|
|
|
|
// We also need to go through the list of instructions that we plan to
|
|
|
|
// move and make sure they are all safe to move down past the merged
|
|
|
|
// instruction.
|
2017-04-14 01:53:07 +08:00
|
|
|
if (offsetsCanBeCombined(CI))
|
|
|
|
if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
|
|
|
|
return true;
|
2016-08-30 03:15:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// We've found a load/store that we couldn't merge for some reason.
|
|
|
|
// We could potentially keep looking, but we'd need to make sure that
|
|
|
|
// it was safe to move I and also all the instruction in InstsToMove
|
|
|
|
// down past this instruction.
|
2017-04-14 01:53:07 +08:00
|
|
|
// check if we can move I across MBBI and if we can move all I's users
|
|
|
|
if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
|
AMDGPU: Do not combine loads/store across physreg defs
Summary:
Since this pass operates on machine SSA form, this should only really
affect M0 in practice.
Fixes various piglit variable-indexing/vs-varying-array-mat4-index-*
Change-Id: Ib2a1dc3a8d7b08225a8da49a86f533faa0986aa8
Fixes: r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4")
Reviewers: arsenm, mareko, rampitec
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D40343
llvm-svn: 325677
2018-02-21 21:31:35 +08:00
|
|
|
!canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
|
|
|
|
hasPhysRegDef(*MBBI))
|
2016-11-03 22:37:13 +08:00
|
|
|
break;
|
2016-08-30 03:15:22 +08:00
|
|
|
}
|
2017-04-14 01:53:07 +08:00
|
|
|
return false;
|
2014-10-11 06:01:59 +08:00
|
|
|
}
|
|
|
|
|
2017-11-29 08:55:57 +08:00
|
|
|
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
|
|
|
|
if (STM->ldsRequiresM0Init())
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
|
|
|
|
if (STM->ldsRequiresM0Init())
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
|
|
|
|
|
|
|
|
return (EltSize == 4) ?
|
|
|
|
AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
|
|
|
|
}
|
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
/// Replace the two DS reads CI.I and CI.Paired with a single read2 (or
/// read2st64 when CI.UseST64 is set) inserted before CI.Paired.
///
/// The merged result is written to a new super-register and copied back into
/// the two original destination operands. When CI.BaseOff is non-zero, a new
/// base register holding "address + CI.BaseOff" is materialized first.
/// Instructions in CI.InstsToMove are moved after the last copy, and both
/// original reads are erased.
///
/// \returns the iterator that followed CI.I before it was erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  // Keep the subregister index of the original address operand; dropping it
  // would make the merged instruction read the full register instead.
  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);

    // The new base is a freshly created VGPR_32 used as a whole.
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
      .addImm(NewOffset0)                        // offset0
      .addImm(NewOffset1)                        // offset1
      .addImm(0)                                 // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  // Read2 is otherwise only referenced by the DEBUG output below.
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  // Grab the successor of CI.I before erasing it.
  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
|
|
|
|
|
2017-11-29 08:55:57 +08:00
|
|
|
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
|
|
|
|
if (STM->ldsRequiresM0Init())
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
|
|
|
|
if (STM->ldsRequiresM0Init())
|
|
|
|
return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
|
|
|
|
|
|
|
|
return (EltSize == 4) ?
|
|
|
|
AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
|
|
|
|
}
|
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
/// Replace the two DS writes CI.I and CI.Paired with a single write2 (or
/// write2st64 when CI.UseST64 is set) inserted before CI.Paired.
///
/// When CI.BaseOff is non-zero, a new base register holding
/// "address + CI.BaseOff" is materialized first. Instructions in
/// CI.InstsToMove are moved after the merged write, and both original writes
/// are erased.
///
/// \returns the iterator that followed CI.I before it was erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with the data operands. We want
  // to be sure we preserve the subregister index and any register flags set
  // on them.
  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  // Keep the subregister index of the original address operand; dropping it
  // would make the merged instruction read the full register instead.
  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);

    // The new base is a freshly created VGPR_32 used as a whole.
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
      .add(*Data0)                               // data0
      .add(*Data1)                               // data1
      .addImm(NewOffset0)                        // offset0
      .addImm(NewOffset1)                        // offset1
      .addImm(0)                                 // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  // Grab the successor of CI.I before erasing it.
  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
|
|
|
|
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot of SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
/// Replace the two S_BUFFER_LOAD_*_IMM instructions CI.I and CI.Paired with a
/// single load of twice the width inserted before CI.Paired.
///
/// The merged load uses the smaller of the two offsets and the glc bit of
/// CI.I. Its result is copied back into the two original destination
/// operands, instructions in CI.InstsToMove are moved after the last copy,
/// and both original loads are erased.
///
/// \returns the iterator that followed CI.I before it was erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *Block = CI.I->getParent();
  DebugLoc Loc = CI.I->getDebugLoc();

  // Everything that depends on whether a pair of X2 loads (giving an X4) or
  // a pair of single-dword loads (giving an X2) is being merged.
  unsigned MergedOpc;
  const TargetRegisterClass *WideRC;
  unsigned LowHalf;
  unsigned HighHalf;
  if (CI.IsX2) {
    MergedOpc = AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    WideRC = &AMDGPU::SReg_128RegClass;
    LowHalf = AMDGPU::sub0_sub1;
    HighHalf = AMDGPU::sub2_sub3;
  } else {
    MergedOpc = AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    WideRC = &AMDGPU::SReg_64_XEXECRegClass;
    LowHalf = AMDGPU::sub0;
    HighHalf = AMDGPU::sub1;
  }

  unsigned WideReg = MRI->createVirtualRegister(WideRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*Block, CI.Paired, Loc, TII->get(MergedOpc), WideReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  // With descending offsets the first instruction's data sits in the upper
  // half of the merged result.
  const bool Descending = CI.Offset0 > CI.Offset1;
  const unsigned FirstHalf = Descending ? HighHalf : LowHalf;
  const unsigned SecondHalf = Descending ? LowHalf : HighHalf;

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *FirstDst = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *SecondDst =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*Block, CI.Paired, Loc, CopyDesc)
      .add(*FirstDst) // Copy to same destination including flags and sub reg.
      .addReg(WideReg, 0, FirstHalf);
  MachineInstr *LastCopy = BuildMI(*Block, CI.Paired, Loc, CopyDesc)
                               .add(*SecondDst)
                               .addReg(WideReg, RegState::Kill, SecondHalf);

  moveInstsAfter(LastCopy, CI.InstsToMove);

  // Grab the successor of CI.I before erasing it.
  MachineBasicBlock::iterator Following = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Following;
}
|
|
|
|
|
2017-11-09 09:52:36 +08:00
|
|
|
/// Replace the two BUFFER_LOAD_* instructions recorded in \p CI (CI.I and
/// CI.Paired) with one load of twice the width.
///
/// The wide load is inserted in front of CI.Paired. Its vaddr (only for the
/// BUFFER_LOAD_OFFEN class), srsrc and soffset operands are taken from CI.I;
/// the offset is the smaller of the two offsets, glc/slc come from CI.GLC0 and
/// CI.SLC0, and tfe is emitted as 0. Each original destination is re-created
/// as a COPY from the matching sub-register of the wide result, and both
/// original loads are erased.
///
/// \returns the iterator that followed CI.I before it was erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  // Any class other than OFFEN is treated as the OFFSET form.
  const bool IsOffenClass = CI.InstClass == BUFFER_LOAD_OFFEN;

  unsigned Opcode;
  if (CI.IsX2)
    Opcode = IsOffenClass ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  else
    Opcode = IsOffenClass ? AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;

  const TargetRegisterClass *SuperRC = &AMDGPU::VReg_64RegClass;
  if (CI.IsX2)
    SuperRC = &AMDGPU::VReg_128RegClass;

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  // Only the OFFEN form carries a vaddr operand.
  if (IsOffenClass)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  // Sub-registers holding the low and high halves of the wide result.
  unsigned LoSubReg = AMDGPU::sub0;
  unsigned HiSubReg = AMDGPU::sub1;
  if (CI.IsX2) {
    LoSubReg = AMDGPU::sub0_sub1;
    HiSubReg = AMDGPU::sub2_sub3;
  }

  // With descending offsets the first instruction's data lives in the high
  // half, so the two indices trade places.
  const bool Descending = CI.Offset0 > CI.Offset1;
  unsigned SubRegIdx0 = Descending ? HiSubReg : LoSubReg;
  unsigned SubRegIdx1 = Descending ? LoSubReg : HiSubReg;

  // Re-materialize the old destination registers from the wide result.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *OldDest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *OldDest1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  // The operand is added as-is so the copy keeps the same destination,
  // including flags and sub-register.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*OldDest0)
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *LastCopy = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                               .add(*OldDest1)
                               .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(LastCopy, CI.InstsToMove);

  // Grab the successor before CI.I goes away.
  MachineBasicBlock::iterator Resume = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Resume;
}
|
|
|
|
|
2017-11-09 09:52:55 +08:00
|
|
|
unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
|
|
|
|
const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
|
|
|
|
IsX2 = false;
|
|
|
|
IsOffen = false;
|
|
|
|
|
|
|
|
switch (I.getOpcode()) {
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
|
|
|
|
IsOffen = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
|
|
|
|
IsOffen = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
|
|
|
|
IsX2 = true;
|
|
|
|
IsOffen = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
|
|
|
|
IsX2 = true;
|
|
|
|
IsOffen = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
|
|
|
|
IsX2 = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
|
|
|
|
case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
|
|
|
|
IsX2 = true;
|
|
|
|
return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Replace the two BUFFER_STORE_* instructions recorded in \p CI (CI.I and
/// CI.Paired) with one store of twice the width.
///
/// The two vdata operands are first glued together with a REG_SEQUENCE into a
/// fresh wide register, which the new store (opcode obtained from
/// promoteBufferStoreOpcode on CI.I) then consumes. Both new instructions are
/// inserted in front of CI.Paired. The vaddr (only for the BUFFER_STORE_OFFEN
/// class), srsrc and soffset operands are taken from CI.I; the offset is the
/// smaller of the two offsets, glc/slc come from CI.GLC0 and CI.SLC0, and tfe
/// is emitted as 0. Both original stores are erased.
///
/// \returns the iterator that followed CI.I before it was erased.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  // Only the promoted opcode is of interest here; the classification outputs
  // are discarded.
  bool IgnoredIsX2, IgnoredIsOffen;
  unsigned Opcode =
      promoteBufferStoreOpcode(*CI.I, IgnoredIsX2, IgnoredIsOffen);

  // Width-dependent choices: where the low/high halves live in the wide
  // source register, and that register's class.
  unsigned LoSubReg = AMDGPU::sub0;
  unsigned HiSubReg = AMDGPU::sub1;
  const TargetRegisterClass *SuperRC = &AMDGPU::VReg_64RegClass;
  if (CI.IsX2) {
    LoSubReg = AMDGPU::sub0_sub1;
    HiSubReg = AMDGPU::sub2_sub3;
    SuperRC = &AMDGPU::VReg_128RegClass;
  }

  // With descending offsets the first instruction's data belongs in the high
  // half, so the two indices trade places.
  const bool Descending = CI.Offset0 > CI.Offset1;
  unsigned SubRegIdx0 = Descending ? HiSubReg : LoSubReg;
  unsigned SubRegIdx1 = Descending ? LoSubReg : HiSubReg;

  // Assemble the new wide source register from the two old data operands.
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *OldData0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *OldData1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*OldData0)
      .addImm(SubRegIdx0)
      .add(*OldData1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  // Only the OFFEN form carries a vaddr operand.
  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  // Grab the successor before CI.I goes away.
  MachineBasicBlock::iterator Resume = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Resume;
}
|
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
// Scan through looking for adjacent LDS operations with constant offsets from
|
|
|
|
// the same base register. We rely on the scheduler to do the hard work of
|
|
|
|
// clustering nearby loads, and assume these are all adjacent.
|
|
|
|
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
|
|
|
|
bool Modified = false;
|
|
|
|
|
|
|
|
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
|
|
|
|
MachineInstr &MI = *I;
|
|
|
|
|
|
|
|
// Don't combine if volatile.
|
|
|
|
if (MI.hasOrderedMemoryRef()) {
|
|
|
|
++I;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2017-04-14 01:53:07 +08:00
|
|
|
CombineInfo CI;
|
|
|
|
CI.I = I;
|
2014-10-11 06:01:59 +08:00
|
|
|
unsigned Opc = MI.getOpcode();
|
2017-11-29 08:55:57 +08:00
|
|
|
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
|
|
|
|
Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
|
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
CI.InstClass = DS_READ_WRITE;
|
2017-11-29 08:55:57 +08:00
|
|
|
CI.EltSize =
|
|
|
|
(Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
|
|
|
|
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
if (findMatchingInst(CI)) {
|
2014-10-11 06:01:59 +08:00
|
|
|
Modified = true;
|
2017-04-14 01:53:07 +08:00
|
|
|
I = mergeRead2Pair(CI);
|
2014-10-11 06:01:59 +08:00
|
|
|
} else {
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
|
|
|
|
continue;
|
2017-11-29 08:55:57 +08:00
|
|
|
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
|
|
|
|
Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
|
|
|
|
Opc == AMDGPU::DS_WRITE_B64_gfx9) {
|
2017-11-09 09:52:30 +08:00
|
|
|
CI.InstClass = DS_READ_WRITE;
|
2017-11-29 08:55:57 +08:00
|
|
|
CI.EltSize
|
|
|
|
= (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
|
|
|
|
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
if (findMatchingInst(CI)) {
|
2014-10-11 06:01:59 +08:00
|
|
|
Modified = true;
|
2017-04-14 01:53:07 +08:00
|
|
|
I = mergeWrite2Pair(CI);
|
2014-10-11 06:01:59 +08:00
|
|
|
} else {
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
AMDGPU: Remove the s_buffer workaround for GFX9 chips
Summary:
I checked the AMD closed source compiler and the workaround is only
needed when x3 is emulated as x4, which we don't do in LLVM.
SMEM x3 opcodes don't exist, and instead there is a possibility to use x4
with the last component being unused. If the last component is out of
buffer bounds and falls on the next 4K page, the hw hangs.
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D42756
llvm-svn: 324486
2018-02-08 00:00:40 +08:00
|
|
|
if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
|
|
|
|
Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
// EltSize is in units of the offset encoding.
|
2017-11-09 09:52:30 +08:00
|
|
|
CI.InstClass = S_BUFFER_LOAD_IMM;
|
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
llvm-svn: 317751
2017-11-09 09:52:23 +08:00
|
|
|
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
|
|
|
|
CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
|
|
|
|
if (findMatchingInst(CI)) {
|
|
|
|
Modified = true;
|
|
|
|
I = mergeSBufferLoadImmPair(CI);
|
|
|
|
if (!CI.IsX2)
|
|
|
|
CreatedX2++;
|
|
|
|
} else {
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2017-11-09 09:52:30 +08:00
|
|
|
if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
|
2017-11-09 09:52:36 +08:00
|
|
|
Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
|
|
|
|
Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
|
|
|
|
Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
|
|
|
|
if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
|
|
|
|
Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
|
|
|
|
CI.InstClass = BUFFER_LOAD_OFFEN;
|
|
|
|
else
|
|
|
|
CI.InstClass = BUFFER_LOAD_OFFSET;
|
|
|
|
|
2017-11-09 09:52:30 +08:00
|
|
|
CI.EltSize = 4;
|
2017-11-09 09:52:36 +08:00
|
|
|
CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
|
|
|
|
Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
|
2017-11-09 09:52:30 +08:00
|
|
|
if (findMatchingInst(CI)) {
|
|
|
|
Modified = true;
|
2017-11-09 09:52:36 +08:00
|
|
|
I = mergeBufferLoadPair(CI);
|
2017-11-09 09:52:30 +08:00
|
|
|
if (!CI.IsX2)
|
|
|
|
CreatedX2++;
|
|
|
|
} else {
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2014-10-11 06:01:59 +08:00
|
|
|
|
2017-11-09 09:52:55 +08:00
|
|
|
bool StoreIsX2, IsOffen;
|
|
|
|
if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
|
|
|
|
CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
|
|
|
|
CI.EltSize = 4;
|
|
|
|
CI.IsX2 = StoreIsX2;
|
|
|
|
if (findMatchingInst(CI)) {
|
|
|
|
Modified = true;
|
|
|
|
I = mergeBufferStorePair(CI);
|
|
|
|
if (!CI.IsX2)
|
|
|
|
CreatedX2++;
|
|
|
|
} else {
|
|
|
|
++I;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-10-11 06:01:59 +08:00
|
|
|
++I;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Modified;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Pass entry point: cache the subtarget, instruction/register info, machine
/// register info and alias analysis for \p MF, then run optimizeBlock over
/// every basic block.
///
/// Nothing is done (and false is returned) when skipFunction() says to skip
/// the function or when the subtarget reports the load/store optimizer as
/// disabled. A block is processed a second time whenever the first pass over
/// it created at least one x2 access, so that those can be merged into x4.
///
/// \returns true if any block was modified.
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // CreatedX2 is bumped by optimizeBlock; start each block from zero.
    CreatedX2 = 0;
    if (optimizeBlock(MBB))
      Changed = true;

    // Run again to convert x2 to x4.
    if (CreatedX2 > 0) {
      if (optimizeBlock(MBB))
        Changed = true;
    }
  }

  return Changed;
}
|