forked from OSchip/llvm-project
[AMDGPU] Fix dwordx3/southern-islands failures.
This commit fixes the dwordx3/Southern Islands failures reported in Bugzilla (https://bugs.llvm.org/show_bug.cgi?id=40129) by not generating the dwordx3 variants of load/store instructions for Southern Islands, because those variants were only added to the ISA after Southern Islands. Differential Revision: https://reviews.llvm.org/D56434 llvm-svn: 350838
This commit is contained in:
parent
57f2c14860
commit
e85d45a699
|
@ -822,6 +822,11 @@ public:
|
|||
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
|
||||
}
|
||||
|
||||
/// \returns true if the subtarget supports DWORDX3 load/store instructions.
/// The answer is taken directly from the CIInsts flag.
|
||||
bool hasDwordx3LoadStores() const {
|
||||
return CIInsts;
|
||||
}
|
||||
|
||||
/// \returns true when the subtarget generation is GFX9 or newer.
// NOTE(review): the exact hazard this guards is not visible in this hunk;
// confirm against the hazard recognizer before relying on the name alone.
bool hasSMovFedHazard() const {
|
||||
return getGeneration() >= AMDGPUSubtarget::GFX9;
|
||||
}
|
||||
|
|
|
@ -160,7 +160,7 @@ private:
|
|||
bool OptimizeAgain;
|
||||
|
||||
static bool offsetsCanBeCombined(CombineInfo &CI);
|
||||
static bool widthsFit(const CombineInfo &CI);
|
||||
static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
|
||||
static unsigned getNewOpcode(const CombineInfo &CI);
|
||||
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
|
||||
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
|
||||
|
@ -367,11 +367,12 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
|
|||
return false;
|
||||
}
|
||||
|
||||
bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
|
||||
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
|
||||
const CombineInfo &CI) {
|
||||
const unsigned Width = (CI.Width0 + CI.Width1);
|
||||
switch (CI.InstClass) {
|
||||
default:
|
||||
return Width <= 4;
|
||||
return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
|
||||
case S_BUFFER_LOAD_IMM:
|
||||
switch (Width) {
|
||||
default:
|
||||
|
@ -645,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
|
|||
// We also need to go through the list of instructions that we plan to
|
||||
// move and make sure they are all safe to move down past the merged
|
||||
// instruction.
|
||||
if (widthsFit(CI) && offsetsCanBeCombined(CI))
|
||||
if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
|
||||
if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -37,9 +37,10 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias
|
|||
; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
|
||||
; GCN-NOT: v_cvt_f32_ubyte3_e32
|
||||
; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
|
||||
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
|
||||
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[MDRESULT:[0-9]+]], [[VAL]]
|
||||
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
|
||||
; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
|
||||
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[MDRESULT]]{{\]}},
|
||||
; VI: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
|
||||
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -60,7 +60,8 @@ endif:
|
|||
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
|
||||
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
|
||||
|
||||
; GCN-DAG: buffer_store_dwordx3
|
||||
; GCN-DAG: buffer_store_dword v
|
||||
; GCN-DAG: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
|
||||
entry:
|
||||
%v = load <3 x i32>, <3 x i32> addrspace(1)* %in
|
||||
|
|
|
@ -195,7 +195,7 @@ main_body:
|
|||
|
||||
;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
|
||||
;CHECK-NEXT: %bb.
|
||||
;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
|
||||
;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
|
||||
;CHECK: s_waitcnt
|
||||
define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
|
||||
main_body:
|
||||
|
@ -245,7 +245,7 @@ main_body:
|
|||
|
||||
;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
|
||||
;CHECK-NEXT: %bb.
|
||||
;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
|
||||
;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
|
||||
;CHECK: s_waitcnt
|
||||
define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
|
||||
main_body:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s
|
||||
|
||||
; This test is mostly to test DAG store merging, so disable the vectorizer.
|
||||
; Run with devices with different unaligned load restrictions.
|
||||
|
@ -65,8 +65,8 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
|
||||
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
|
||||
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
|
||||
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
|
||||
|
@ -87,8 +87,8 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
|
||||
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
|
||||
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
||||
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
|
||||
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
|
||||
|
@ -164,9 +164,10 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float ad
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
|
||||
; SI-DAG: buffer_store_dwordx3
|
||||
; SI-NOT: buffer_store_dwordx2
|
||||
; SI-NOT: buffer_store_dword
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dword v
|
||||
; CI-DAG: buffer_store_dwordx3
|
||||
; GCN-NOT: buffer_store_dword
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
|
||||
|
@ -274,9 +275,13 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
|
||||
; SI-DAG: buffer_load_dwordx3
|
||||
; SI-DAG: buffer_load_dwordx2
|
||||
; SI-DAG: buffer_load_dword v
|
||||
; CI-DAG: buffer_load_dwordx3
|
||||
; GCN: s_waitcnt
|
||||
; SI-DAG: buffer_store_dwordx3 v
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dword v
|
||||
; CI-DAG: buffer_store_dwordx3
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
|
||||
|
@ -561,7 +566,9 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
|
|||
|
||||
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx3
|
||||
; SI: buffer_store_dwordx2
|
||||
; SI: buffer_store_dword v
|
||||
; CI: buffer_store_dwordx3
|
||||
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
|
||||
store i32 34, i32 addrspace(1)* %out, align 4
|
||||
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
|
||||
|
@ -608,11 +615,15 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*
|
|||
|
||||
; GCN-LABEL: {{^}}copy_v3i32_align4:
|
||||
; GCN-NOT: SCRATCH_RSRC_DWORD
|
||||
; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
|
||||
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; GCN-NOT: offen
|
||||
; GCN: s_waitcnt vmcnt
|
||||
; GCN-NOT: offen
|
||||
; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
|
||||
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
|
||||
; GCN: ScratchSize: 0{{$}}
|
||||
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
|
||||
|
@ -639,11 +650,15 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou
|
|||
|
||||
; GCN-LABEL: {{^}}copy_v3f32_align4:
|
||||
; GCN-NOT: SCRATCH_RSRC_DWORD
|
||||
; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
|
||||
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; GCN-NOT: offen
|
||||
; GCN: s_waitcnt vmcnt
|
||||
; GCN-NOT: offen
|
||||
; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
|
||||
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; GCN: ScratchSize: 0{{$}}
|
||||
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
|
||||
|
@ -273,7 +273,10 @@ entry:
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_v3i32:
|
||||
; SIVI-DAG: buffer_store_dwordx3
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dword v
|
||||
|
||||
; VI-DAG: buffer_store_dwordx3
|
||||
|
||||
; GFX9-DAG: global_store_dwordx2
|
||||
; GFX9-DAG: global_store_dword v
|
||||
|
|
|
@ -89,7 +89,9 @@ define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
|
||||
; GCN-DAG: buffer_store_dwordx3
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dword v
|
||||
; VI-DAG: buffer_store_dwordx3
|
||||
define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
|
||||
%trunc = trunc <3 x i64> %x to <3 x i32>
|
||||
store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
|
||||
|
|
Loading…
Reference in New Issue