forked from OSchip/llvm-project
[AMDGPU] Add SIMemoryLegalizer comments to clarify bit usage
Attempt to further document the intended cache policies requested by different combinations of the GLC, SLC and DLC bits. GFX10 non-temporal stores are updated to set GLC.

Reviewed By: t-tye

Differential Revision: https://reviews.llvm.org/D114351
This commit is contained in:
parent
4af45f10cc
commit
8967d044fc
|
@ -795,6 +795,8 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
|
|||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
// Set L1 cache policy to MISS_EVICT.
|
||||
// Note: there is no L2 cache bypass policy at the ISA level.
|
||||
Changed |= enableGLCBit(MI);
|
||||
break;
|
||||
case SIAtomicScope::WORKGROUP:
|
||||
|
@ -837,8 +839,10 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
|
|||
assert(MI->mayLoad() && MI->mayStore());
|
||||
bool Changed = false;
|
||||
|
||||
/// The L1 cache is write through so does not need to be bypassed. There is no
|
||||
/// bypass control for the L2 cache at the isa level.
|
||||
/// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
|
||||
/// bypassed, and the GLC bit is instead used to indicate if they are
|
||||
/// return or no-return.
|
||||
/// Note: there is no L2 cache coherent bypass control at the ISA level.
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
@ -860,6 +864,9 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
|
|||
bool Changed = false;
|
||||
|
||||
if (IsVolatile) {
|
||||
// Set L1 cache policy to be MISS_EVICT for load instructions
|
||||
// and MISS_LRU for store instructions.
|
||||
// Note: there is no L2 cache bypass policy at the ISA level.
|
||||
if (Op == SIMemOp::LOAD)
|
||||
Changed |= enableGLCBit(MI);
|
||||
|
||||
|
@ -875,7 +882,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
|
|||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
||||
// Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
|
||||
// for both loads and stores, and the L2 cache policy to STREAM.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
|
@ -1097,6 +1105,8 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass(
|
|||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
// Set the L1 cache policy to MISS_LRU.
|
||||
// Note: there is no L2 cache bypass policy at the ISA level.
|
||||
Changed |= enableGLCBit(MI);
|
||||
break;
|
||||
case SIAtomicScope::WORKGROUP:
|
||||
|
@ -1206,6 +1216,9 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
|
|||
bool Changed = false;
|
||||
|
||||
if (IsVolatile) {
|
||||
// Set L1 cache policy to be MISS_EVICT for load instructions
|
||||
// and MISS_LRU for store instructions.
|
||||
// Note: there is no L2 cache bypass policy at the ISA level.
|
||||
if (Op == SIMemOp::LOAD)
|
||||
Changed |= enableGLCBit(MI);
|
||||
|
||||
|
@ -1221,7 +1234,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
|
|||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
||||
// Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
|
||||
// for both loads and stores, and the L2 cache policy to STREAM.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
|
@ -1380,12 +1394,11 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
|
|||
bool Changed = false;
|
||||
|
||||
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
||||
/// TODO Do not set glc for rmw atomic operations as they
|
||||
/// implicitly bypass the L0/L1 caches.
|
||||
|
||||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
// Set the L0 and L1 cache policies to MISS_EVICT.
|
||||
// Note: there is no L2 cache coherent bypass control at the ISA level.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableDLCBit(MI);
|
||||
break;
|
||||
|
@ -1434,6 +1447,9 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
|
|||
bool Changed = false;
|
||||
|
||||
if (IsVolatile) {
|
||||
// Set L0 and L1 cache policy to be MISS_EVICT for load instructions
|
||||
// and MISS_LRU for store instructions.
|
||||
// Note: there is no L2 cache coherent bypass control at the ISA level.
|
||||
if (Op == SIMemOp::LOAD) {
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableDLCBit(MI);
|
||||
|
@ -1450,8 +1466,14 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
|
|||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
|
||||
// For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
|
||||
// and L2 cache policy to STREAM.
|
||||
// For stores setting both GLC and SLC configures L0 and L1 cache policy
|
||||
// to MISS_EVICT and the L2 cache policy to STREAM.
|
||||
if (Op == SIMemOp::STORE)
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
|
|
@ -216,7 +216,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
|
|||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_0:
|
||||
|
@ -229,7 +229,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
|
|||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
|
||||
|
@ -306,7 +306,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_1:
|
||||
|
@ -320,7 +320,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
|
|||
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
|
||||
|
|
|
@ -239,7 +239,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
|
|||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_nontemporal_store_0:
|
||||
|
@ -250,7 +250,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
|
|||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
|
||||
|
@ -335,7 +335,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
|
|||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_nontemporal_store_1:
|
||||
|
@ -346,7 +346,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
|
|||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
|
||||
|
|
|
@ -309,7 +309,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
|
|||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_nontemporal_store_0:
|
||||
|
@ -326,7 +326,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
|
|||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc
|
||||
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
|
||||
|
@ -434,7 +434,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
|
|||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_nontemporal_store_1:
|
||||
|
@ -451,7 +451,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
|
|||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc
|
||||
; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
|
||||
|
|
Loading…
Reference in New Issue