AMDGPU/SI: Don't allow unaligned scratch access
Summary: The hardware doesn't support this.

Reviewers: arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, llvm-commits, tony-tye

Differential Revision: https://reviews.llvm.org/D25523

llvm-svn: 284257
commit 64a9d0876c (parent aaa44fe5cd)
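The functional change is easiest to see at the IR level. Below is a minimal sketch, distilled from the flat-address-space tests added in this commit: with unaligned-scratch-access disabled (the default on all subtargets), an align 1 access through a pointer that is, or may be, scratch is reported as an illegal misaligned access, so legalization expands it into byte operations.

; Sketch distilled from the new tests below: with the feature off,
; this align 1 load of a scratch-backed i32 is selected as four
; flat_load_ubyte operations instead of a single dword load.
define void @unaligned_scratch_sketch() {
  %scratch = alloca i32
  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
  ret void
}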
@@ -67,6 +67,12 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
   "Support unaligned global loads and stores"
 >;
 
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+  "UnalignedScratchAccess",
+  "true",
+  "Support unaligned scratch loads and stores"
+>;
+
 def FeatureXNACK : SubtargetFeature<"xnack",
   "EnableXNACK",
   "true",
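For reference, the new feature is spelled unaligned-scratch-access on the command line and can be toggled with -mattr like any other subtarget feature (the vectorizer tests below use exactly this spelling). A hypothetical RUN-style sketch, assuming a configuration where the feature is forced on:

; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-scratch-access < %s
; With the feature forced on, the target reports unaligned private
; accesses as supported, so this align 1 load may stay a single access.
define void @feature_forced_on() {
  %p = alloca i32
  %v = load volatile i32, i32* %p, align 1
  ret void
}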
@@ -85,6 +85,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FP64Denormals(false),
     FPExceptions(false),
     FlatForGlobal(false),
+    UnalignedScratchAccess(false),
     UnalignedBufferAccess(false),
 
     EnableXNACK(false),
@@ -76,6 +76,7 @@ protected:
   bool FP64Denormals;
   bool FPExceptions;
   bool FlatForGlobal;
+  bool UnalignedScratchAccess;
   bool UnalignedBufferAccess;
   bool EnableXNACK;
   bool DebuggerInsertNops;
@@ -277,6 +278,10 @@ public:
     return UnalignedBufferAccess;
   }
 
+  bool hasUnalignedScratchAccess() const {
+    return UnalignedScratchAccess;
+  }
+
   bool isXNACKEnabled() const {
     return EnableXNACK;
  }
@@ -459,6 +459,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return AlignedBy4;
   }
 
+  // FIXME: We have to be conservative here and assume that flat operations
+  // will access scratch. If we had access to the IR function, then we
+  // could determine if any private memory was used in the function.
+  if (!Subtarget->hasUnalignedScratchAccess() &&
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    return false;
+  }
+
   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have a uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
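One consequence of the conservatism described in the FIXME: after an addrspacecast to the flat address space, the backend can no longer prove that an access avoids scratch, so even flat accesses that only ever touch global memory take the slow path. A small sketch, assuming the feature is off:

; %g points only at global memory, but the cast erases that fact: the
; align 1 load below is a flat (addrspace 4) access that might alias
; scratch as far as codegen can tell, so it is byte-expanded too.
define void @flat_may_alias_scratch(i32 addrspace(1)* %g) {
  %f = addrspacecast i32 addrspace(1)* %g to i32 addrspace(4)*
  %v = load volatile i32, i32 addrspace(4)* %f, align 1
  ret void
}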
@@ -1,7 +1,7 @@
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
@@ -73,7 +73,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
 ; CHECK: flat_load_dwordx2
 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  %fload = load i64, i64 addrspace(4)* %fptr, align 4
+  %fload = load i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -82,7 +82,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
 ; CHECK: flat_load_dwordx4
 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
+  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
   ret void
 }
@@ -127,6 +127,30 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   ret void
 }
 
+; CHECK-LABEL: flat_scratch_unaligned_load:
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+define void @flat_scratch_unaligned_load() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
+; CHECK-LABEL: flat_scratch_unaligned_store:
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+define void @flat_scratch_unaligned_store() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind convergent }
 attributes #3 = { nounwind readnone }
@@ -1,5 +1,5 @@
 ; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "amdgcn--"
@@ -1,19 +1,21 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT4 -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8 -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16 -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
 
-; ELT8: store <2 x i32>
-; ELT8: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
 
-; ELT16: store <4 x i32>
+; ELT16-UNALIGNED: store <4 x i32>
 define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
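Read together, the prefixes state the vectorizer's contract for private (scratch) memory: with the feature absent (ALIGNED), the four i32 stores stay scalar at every max-private-element-size, because a wider merged store could be under-aligned; with the feature on, ELT8-UNALIGNED may merge them pairwise and ELT16-UNALIGNED may merge all four. A comment-only sketch, with made-up store values for illustration:

; Input (hypothetical values): four adjacent i32 stores to private memory.
;   store i32 1, i32* %out
;   store i32 2, i32* %out.gep.1
;   store i32 3, i32* %out.gep.2
;   store i32 4, i32* %out.gep.3
;
; ALIGNED (feature absent): the four scalar stores remain.
; ELT8-UNALIGNED: store <2 x i32> <i32 1, i32 2>, ... (two merged stores)
; ELT16-UNALIGNED: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ...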