AMDGPU: Define priorities for register classes

Allocating larger register classes first should give better allocation
results (and, more importantly for me, make the lit tests more stable
with respect to scheduler changes).

Patch by Matthias Braun

llvm-svn: 270312
Matt Arsenault 2016-05-21 03:55:07 +00:00
parent 97565ded80
commit 7f9eabd2c2
10 changed files with 74 additions and 51 deletions
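The mechanism the patch relies on is the AllocationPriority field of TableGen RegisterClass definitions: the greedy register allocator assigns virtual registers whose class has a higher priority value first, so the wide tuple classes (which are the hardest to place) receive registers before the plain 32-bit classes. The following is a minimal sketch of that pattern only; the Demo_* names are hypothetical and are not definitions from this patch.

// Hypothetical illustration of the priority scheme applied in the diff below:
// the wide 512-bit tuple class is allocated before the 32-bit scalar class.
def Demo_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
  (add (sequence "SGPR%u", 0, 103))> {
  let AllocationPriority = 1;  // narrowest class, allocated last
}

def Demo_512 : RegisterClass<"AMDGPU", [v16i32], 32, (add SGPR_512)> {
  let CopyCost = 8;            // copies are expensive: 8 s_mov_b64
  let AllocationPriority = 6;  // widest class, allocated first
}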

@@ -124,7 +124,9 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
 // SGPR 32-bit registers
 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add (sequence "SGPR%u", 0, 103))>;
+  (add (sequence "SGPR%u", 0, 103))> {
+  let AllocationPriority = 1;
+}
 // SGPR 64-bit registers
 def SGPR_64Regs : RegisterTuples<[sub0, sub1],
@@ -189,7 +191,9 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
 // VGPR 32-bit registers
 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add (sequence "VGPR%u", 0, 255))>;
+  (add (sequence "VGPR%u", 0, 255))> {
+  let AllocationPriority = 1;
+}
 // VGPR 64-bit registers
 def VGPR_64 : RegisterTuples<[sub0, sub1],
@@ -253,51 +257,63 @@ class RegImmMatcher<string name> : AsmOperandClass {
 // See comments in SIInstructions.td for more info.
 def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
   (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
-  TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)
->;
+  TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+  let AllocationPriority = 1;
+}
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add SReg_32_XM0, M0)
->;
+  (add SReg_32_XM0, M0)> {
+  let AllocationPriority = 1;
+}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+  let AllocationPriority = 2;
+}
 def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
   let isAllocatable = 0;
 }
 def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
-  (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)
->;
+  (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+  let AllocationPriority = 2;
+}
 // Requires 2 s_mov_b64 to copy
 let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)>;
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+  let AllocationPriority = 4;
+}
 def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
   let isAllocatable = 0;
 }
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+  let AllocationPriority = 4;
+}
 } // End CopyCost = 2
 def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
   // Requires 4 s_mov_b64 to copy
   let CopyCost = 4;
+  let AllocationPriority = 5;
 }
 def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
+  let AllocationPriority = 6;
 }
 // Register class for all vector registers (VGPRs + Interploation Registers)
 def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
   // Requires 2 v_mov_b32 to copy
   let CopyCost = 2;
+  let AllocationPriority = 2;
 }
 def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
@@ -305,19 +321,23 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
   // Requires 3 v_mov_b32 to copy
   let CopyCost = 3;
+  let AllocationPriority = 3;
 }
 def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
   // Requires 4 v_mov_b32 to copy
   let CopyCost = 4;
+  let AllocationPriority = 4;
 }
 def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
   let CopyCost = 8;
+  let AllocationPriority = 5;
 }
 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
   let CopyCost = 16;
+  let AllocationPriority = 6;
 }
 def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {

@@ -1,9 +1,9 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
+; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -11,9 +11,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; Make sure we don't overwrite workitem information with private memory
-; FUNC-LABEL: {{^}}work_item_info:
-; SI-NOT: v_mov_b32_e{{(32|64)}} v0
+; GCN-LABEL: {{^}}work_item_info:
+; GCN-NOT: v0
+; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]

@@ -398,9 +398,8 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: v_cvt_f64_f32_e32

@@ -725,7 +725,7 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)
 ; an immediate.
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
 ; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; SI: ds_read_b32 v0, v[[ZERO]] offset:4
+; SI: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
 ; R600: LDS_READ_RET
 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:

@@ -6,12 +6,13 @@
 ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
 ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
-; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
 ; GCN-NOT: v_mov_b32
-; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
 ; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]

@@ -51,14 +51,16 @@ done: ; preds = %loop
 }
 ; Test moving an SMRD instruction to the VALU
+; FIXME: movs can be moved before nop to reduce count
 ; GCN-LABEL: {{^}}smrd_valu:
 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
-; SI: s_mov_b32
-; SI: s_nop 2
+; SI: s_nop 3
 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; SI: s_mov_b32
 ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]

@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
 ; FUNC-LABEL: {{^}}cluster_arg_loads:
 ; FIXME: Due to changes in the load clustering heuristics. We no longer
@@ -9,9 +9,9 @@
 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
 ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4

@@ -12,16 +12,16 @@
 ; GCN-LABEL: {{^}}main:
-; GCN-DAG: s_mov_b32 s6, s12
+; GCN-DAG: s_mov_b32 s13, s12
 ; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
 ; GCN-DAG: s_mov_b32 s18, -1
 ; SI-DAG: s_mov_b32 s19, 0x88f000
 ; VI-DAG: s_mov_b32 s19, 0x880000
-; s6 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s6 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; s13 is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024

@@ -6,9 +6,9 @@
 ; for the original bug.
 ; GCN: {{^}}test:
-; GCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
-; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
+; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
+; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out

@@ -310,16 +310,16 @@ main_body:
 ; ... but only if WQM is necessary.
 ;
-;CHECK-LABEL: {{^}}test_kill_1:
-;CHECK-NEXT: ; %main_body
-;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
-;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
-;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;SI: buffer_store_dword
-;VI: flat_store_dword
-;CHECK-NOT: wqm
-;CHECK: v_cmpx_
+; CHECK-LABEL: {{^}}test_kill_1:
+; CHECK-NEXT: ; %main_body
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; SI: buffer_store_dword
+; VI: flat_store_dword
+; CHECK-NOT: wqm
+; CHECK: v_cmpx_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)