forked from OSchip/llvm-project
[amdgpu] Enable use of AA during codegen.
- Add an internal option `-amdgpu-use-aa-in-codegen` to enable or disable this feature. By default, it's enabled. Differential Revision: https://reviews.llvm.org/D89320
This commit is contained in:
parent
bce770ffa6
commit
0d092303b4
|
@ -55,6 +55,10 @@ static cl::opt<bool> EnableFlatScratch(
|
||||||
cl::desc("Use flat scratch instructions"),
|
cl::desc("Use flat scratch instructions"),
|
||||||
cl::init(false));
|
cl::init(false));
|
||||||
|
|
||||||
|
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
|
||||||
|
cl::desc("Enable the use of AA during codegen."),
|
||||||
|
cl::init(true));
|
||||||
|
|
||||||
GCNSubtarget::~GCNSubtarget() = default;
|
GCNSubtarget::~GCNSubtarget() = default;
|
||||||
|
|
||||||
R600Subtarget &
|
R600Subtarget &
|
||||||
|
@ -608,6 +612,8 @@ bool GCNSubtarget::useVGPRIndexMode() const {
|
||||||
return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
|
return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool GCNSubtarget::useAA() const { return UseAA; }
|
||||||
|
|
||||||
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
|
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
|
||||||
if (getGeneration() >= AMDGPUSubtarget::GFX10)
|
if (getGeneration() >= AMDGPUSubtarget::GFX10)
|
||||||
return getMaxWavesPerEU();
|
return getMaxWavesPerEU();
|
||||||
|
|
|
@ -944,6 +944,8 @@ public:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool useAA() const override;
|
||||||
|
|
||||||
bool enableSubRegLiveness() const override {
|
bool enableSubRegLiveness() const override {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
|
||||||
|
|
||||||
|
|
||||||
; There is no dependence between the store and the two loads. So we can combine the loads
|
; There is no dependence between the store and the two loads. So we can combine
|
||||||
; and the combined load is at the original place of the second load.
|
; the loads and schedule the combined load freely.
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}ds_combine_nodep
|
; GCN-LABEL: {{^}}ds_combine_nodep
|
||||||
|
|
||||||
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
|
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
|
||||||
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
|
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
|
||||||
|
; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
|
||||||
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
|
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
|
||||||
|
|
||||||
%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
|
%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
|
||||||
|
|
|
@ -54,8 +54,8 @@ bb:
|
||||||
|
|
||||||
; uniform load dominated by no-alias store - scalarize
|
; uniform load dominated by no-alias store - scalarize
|
||||||
; CHECK-LABEL: @no_memdep_alias_arg
|
; CHECK-LABEL: @no_memdep_alias_arg
|
||||||
; CHECK: flat_store_dword
|
; CHECK: s_load_dwordx2 s{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0
|
||||||
; CHECK: s_load_dword [[SVAL:s[0-9]+]]
|
; CHECK: s_load_dword [[SVAL:s[0-9]+]], s{{\[}}[[IN_LO]]:[[IN_HI]]], 0x0
|
||||||
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
||||||
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
||||||
|
|
||||||
|
|
|
@ -1645,8 +1645,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||||
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||||
; CI-NEXT: flat_load_dword v4, v[0:1]
|
; CI-NEXT: flat_load_dword v4, v[0:1]
|
||||||
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||||
; CI-NEXT: s_mov_b32 s2, 0xffff
|
|
||||||
; CI-NEXT: s_mov_b32 s3, 0
|
; CI-NEXT: s_mov_b32 s3, 0
|
||||||
|
; CI-NEXT: s_mov_b32 s2, 0xffff
|
||||||
; CI-NEXT: v_mov_b32_e32 v3, s1
|
; CI-NEXT: v_mov_b32_e32 v3, s1
|
||||||
; CI-NEXT: s_lshl_b32 s1, s4, 16
|
; CI-NEXT: s_lshl_b32 s1, s4, 16
|
||||||
; CI-NEXT: s_and_b32 s4, s4, s2
|
; CI-NEXT: s_and_b32 s4, s4, s2
|
||||||
|
|
|
@ -8,23 +8,23 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) {
|
||||||
; MUBUF: ; %bb.0:
|
; MUBUF: ; %bb.0:
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36
|
; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
; MUBUF-NEXT: global_load_dword v11, v[1:2], off offset:32
|
||||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36
|
|
||||||
; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:32
|
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
||||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32
|
|
||||||
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
|
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
|
||||||
|
; MUBUF-NEXT: s_waitcnt vmcnt(3)
|
||||||
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36
|
||||||
|
; MUBUF-NEXT: s_waitcnt vmcnt(3)
|
||||||
|
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
|
||||||
|
; MUBUF-NEXT: s_waitcnt vmcnt(3)
|
||||||
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28
|
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28
|
||||||
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24
|
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24
|
||||||
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
|
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
|
||||||
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16
|
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16
|
||||||
; MUBUF-NEXT: global_load_dwordx4 v[0:3], v[1:2], off
|
; MUBUF-NEXT: s_waitcnt vmcnt(6)
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12
|
||||||
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
|
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8
|
||||||
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4
|
||||||
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
|
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32
|
||||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
|
|
||||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||||
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
||||||
;
|
;
|
||||||
|
@ -32,23 +32,23 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) {
|
||||||
; FLATSCR: ; %bb.0:
|
; FLATSCR: ; %bb.0:
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||||
; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36
|
; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
; FLATSCR-NEXT: global_load_dword v11, v[1:2], off offset:32
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36
|
|
||||||
; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:32
|
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:32
|
|
||||||
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
|
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
|
||||||
|
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
|
||||||
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36
|
||||||
|
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
|
||||||
|
; FLATSCR-NEXT: scratch_store_dword off, v11, s32 offset:32
|
||||||
|
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28
|
; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24
|
; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20
|
; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16
|
; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16
|
||||||
; FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[1:2], off
|
; FLATSCR-NEXT: s_waitcnt vmcnt(6)
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
; FLATSCR-NEXT: scratch_store_dword off, v10, s32 offset:12
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:12
|
; FLATSCR-NEXT: scratch_store_dword off, v9, s32 offset:8
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v2, s32 offset:8
|
; FLATSCR-NEXT: scratch_store_dword off, v8, s32 offset:4
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4
|
; FLATSCR-NEXT: scratch_store_dword off, v7, s32
|
||||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s32
|
|
||||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||||
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
||||||
%alloca = alloca [40 x i8], addrspace(5)
|
%alloca = alloca [40 x i8], addrspace(5)
|
||||||
|
|
|
@ -122,38 +122,26 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
|
||||||
; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56
|
; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56
|
||||||
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
|
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
|
||||||
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
|
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: s_waitcnt vmcnt(12)
|
||||||
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12
|
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8
|
||||||
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16
|
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
|
||||||
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20
|
; GCN-NEXT: s_waitcnt vmcnt(12)
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28
|
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24
|
||||||
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32
|
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16
|
||||||
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36
|
; GCN-NEXT: s_waitcnt vmcnt(12)
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44
|
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40
|
||||||
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48
|
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32
|
||||||
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52
|
; GCN-NEXT: s_waitcnt vmcnt(12)
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56
|
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(15)
|
|
||||||
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
|
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
|
||||||
|
; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56
|
||||||
|
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52
|
||||||
|
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48
|
||||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||||
bb:
|
bb:
|
||||||
|
|
|
@ -132,8 +132,8 @@ entry:
|
||||||
|
|
||||||
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1
|
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1
|
||||||
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2
|
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2
|
||||||
|
; HSA-ELTGE8-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[0:3], 0 offen
|
||||||
; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
|
; HSA-ELTGE8: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
|
||||||
|
|
||||||
|
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
|
||||||
|
@ -141,8 +141,9 @@ entry:
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
|
||||||
|
|
||||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
|
; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
|
||||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
|
; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}}
|
||||||
|
; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
|
||||||
define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
||||||
entry:
|
entry:
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
@ -168,8 +169,8 @@ entry:
|
||||||
|
|
||||||
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
|
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16
|
||||||
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24
|
; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24
|
||||||
|
; HSA-ELTGE8-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[0:3], 0 offen
|
||||||
; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen
|
; HSA-ELTGE8: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
|
||||||
|
|
||||||
|
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}}
|
||||||
|
@ -177,8 +178,9 @@ entry:
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}}
|
||||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
|
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}}
|
||||||
|
|
||||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
|
; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}}
|
||||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}}
|
; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}}
|
||||||
|
; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
|
||||||
define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
||||||
entry:
|
entry:
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
|
; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
|
; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
|
; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
|
||||||
|
|
||||||
; We expect a two digit VGPR usage here, not a three digit.
|
; We expect a two digit VGPR usage here, not a three digit.
|
||||||
; CHECK: NumVgprs: {{[0-9][0-9]$}}
|
; CHECK: NumVgprs: {{[0-9][0-9]$}}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s
|
; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s
|
; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s
|
||||||
|
|
||||||
; SI-MINREG: NumSgprs: {{[1-9]$}}
|
; SI-MINREG: NumSgprs: {{[1-9]$}}
|
||||||
; SI-MINREG: NumVgprs: {{[1-9]$}}
|
; SI-MINREG: NumVgprs: {{[1-9]$}}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -verify-machineinstrs < %s | FileCheck %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
|
||||||
|
|
||||||
; We expect a three digit VGPR usage here since only one wave requested.
|
; We expect a three digit VGPR usage here since only one wave requested.
|
||||||
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
|
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
|
||||||
|
|
|
@ -47,9 +47,9 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
|
||||||
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0
|
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0
|
||||||
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||||
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
|
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
|
||||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
||||||
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
||||||
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||||
|
; CHECK-NEXT: s_waitcnt vmcnt(1)
|
||||||
|
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
|
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
|
||||||
; CHECK-NEXT: BB1_1: ; %bb9
|
; CHECK-NEXT: BB1_1: ; %bb9
|
||||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||||
|
|
Loading…
Reference in New Issue