forked from OSchip/llvm-project
[AMDGPU] Workaround for LDS Misalignment bug on GFX10
Add a subtarget feature check to avoid using ds_read/write_b96/128 with insufficient alignment when the LDS misalignment bug is present on the target hardware. Add this "feature" to GFX 10.1.1, which is also affected. Add a global-isel test.
This commit is contained in:
parent
3a577f5446
commit
43af2a6faa
|
@ -163,7 +163,7 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
|
|||
def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
|
||||
"LDSMisalignedBug",
|
||||
"true",
|
||||
"Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
|
||||
"Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode"
|
||||
>;
|
||||
|
||||
def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug",
|
||||
|
@ -929,6 +929,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
|
|||
FeatureSMemTimeInst,
|
||||
FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts,
|
||||
FeatureLdsMisalignedBug,
|
||||
FeatureDoesNotSupportXNACK,
|
||||
FeatureCodeObjectV3])>;
|
||||
|
||||
|
|
|
@ -1417,8 +1417,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
|
|||
}
|
||||
if (Size == 96) {
|
||||
// ds_read/write_b96 require 16-byte alignment on gfx8 and older.
|
||||
bool Aligned =
|
||||
Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 16);
|
||||
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
|
||||
!Subtarget->hasLDSMisalignedBug())
|
||||
? 4
|
||||
: 16);
|
||||
if (IsFast)
|
||||
*IsFast = Aligned;
|
||||
|
||||
|
@ -1428,8 +1430,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
|
|||
// ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
|
||||
// can do an 8 byte aligned, 16 byte access in a single operation using
|
||||
// ds_read2/write2_b64.
|
||||
bool Aligned =
|
||||
Alignment >= Align(Subtarget->hasUnalignedDSAccess() ? 4 : 8);
|
||||
bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
|
||||
!Subtarget->hasLDSMisalignedBug())
|
||||
? 4
|
||||
: 8);
|
||||
if (IsFast)
|
||||
*IsFast = Aligned;
|
||||
|
||||
|
|
|
@ -0,0 +1,128 @@
|
|||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v2:
|
||||
; GCN-DAG: ds_read2_b32
|
||||
; GCN-DAG: ds_write2_b32
|
||||
; Kernel under test: indexes %arg (an addrspace(3) pointer) by the workitem id
; in i32 units, loads a <2 x i32> with declared 4-byte alignment, swaps the two
; elements, and stores the vector back to the same address (align 4).
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
|
||||
%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
|
||||
%v1 = extractelement <2 x i32> %load, i32 0
|
||||
%v2 = extractelement <2 x i32> %load, i32 1
|
||||
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
|
||||
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
|
||||
store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v4:
|
||||
; VECT-DAG: ds_read_b128
|
||||
; VECT-DAG: ds_write_b128
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; Kernel under test: indexes %arg (an addrspace(3) pointer) by the workitem id
; in i32 units, loads a <4 x i32> with declared 4-byte alignment, reverses the
; element order, and stores the vector back to the same address (align 4).
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
|
||||
%v1 = extractelement <4 x i32> %load, i32 0
|
||||
%v2 = extractelement <4 x i32> %load, i32 1
|
||||
%v3 = extractelement <4 x i32> %load, i32 2
|
||||
%v4 = extractelement <4 x i32> %load, i32 3
|
||||
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
|
||||
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
|
||||
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
|
||||
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
|
||||
store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v3:
|
||||
; VECT-DAG: ds_read_b96
|
||||
; VECT-DAG: ds_write_b96
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write_b32
|
||||
; Kernel under test: indexes %arg (an addrspace(3) pointer) by the workitem id
; in i32 units, loads a <3 x i32> with declared 4-byte alignment, rotates the
; elements (result is [e2, e0, e1]), and stores the vector back to the same
; address (align 4).
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
|
||||
%v1 = extractelement <3 x i32> %load, i32 0
|
||||
%v2 = extractelement <3 x i32> %load, i32 1
|
||||
%v3 = extractelement <3 x i32> %load, i32 2
|
||||
%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
|
||||
%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
|
||||
%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
|
||||
store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_aligned_v2:
|
||||
; GCN-DAG: ds_read_b64
|
||||
; GCN-DAG: ds_write_b64
|
||||
; Kernel under test: same element swap as the misaligned <2 x i32> case, but
; the load and store both declare 8-byte alignment.
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
|
||||
%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
|
||||
%v1 = extractelement <2 x i32> %load, i32 0
|
||||
%v2 = extractelement <2 x i32> %load, i32 1
|
||||
%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
|
||||
%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
|
||||
store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_aligned_v3:
|
||||
; GCN-DAG: ds_read_b96
|
||||
; GCN-DAG: ds_write_b96
|
||||
; Kernel under test: same <3 x i32> rotation (result is [e2, e0, e1]) as the
; misaligned case, but the load and store both declare 16-byte alignment.
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
|
||||
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
|
||||
%v1 = extractelement <3 x i32> %load, i32 0
|
||||
%v2 = extractelement <3 x i32> %load, i32 1
|
||||
%v3 = extractelement <3 x i32> %load, i32 2
|
||||
%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
|
||||
%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
|
||||
%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
|
||||
store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_local_v4_aligned8:
|
||||
; GCN-DAG: ds_read_b128
|
||||
; GCN-DAG: ds_write_b128
|
||||
; Kernel under test: same <4 x i32> element reversal as the misaligned case,
; but the load and store both declare 8-byte alignment (less than the 16-byte
; size of the vector).
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
|
||||
%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
|
||||
%v1 = extractelement <4 x i32> %load, i32 0
|
||||
%v2 = extractelement <4 x i32> %load, i32 1
|
||||
%v3 = extractelement <4 x i32> %load, i32 2
|
||||
%v4 = extractelement <4 x i32> %load, i32 3
|
||||
%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
|
||||
%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
|
||||
%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
|
||||
%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
|
||||
store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VECT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
|
||||
|
||||
|
@ -21,8 +21,12 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v4:
|
||||
; GCN-DAG: ds_read_b128
|
||||
; GCN-DAG: ds_write_b128
|
||||
; VECT-DAG: ds_read_b128
|
||||
; VECT-DAG: ds_write_b128
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -42,8 +46,12 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: test_local_misaligned_v3:
|
||||
; GCN-DAG: ds_read_b96
|
||||
; GCN-DAG: ds_write_b96
|
||||
; VECT-DAG: ds_read_b96
|
||||
; VECT-DAG: ds_write_b96
|
||||
; SPLIT-DAG: ds_read2_b32
|
||||
; SPLIT-DAG: ds_read_b32
|
||||
; SPLIT-DAG: ds_write2_b32
|
||||
; SPLIT-DAG: ds_write_b32
|
||||
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
|
||||
bb:
|
||||
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
Loading…
Reference in New Issue