From 64a9d0876c55a95db04378e3788f0c34caa2edbf Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Fri, 14 Oct 2016 18:10:39 +0000
Subject: [PATCH] AMDGPU/SI: Don't allow unaligned scratch access

Summary: The hardware doesn't support this.

Reviewers: arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, llvm-commits, tony-tye

Differential Revision: https://reviews.llvm.org/D25523

llvm-svn: 284257
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  6 ++++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  5 +++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  9 +++++
 .../test/CodeGen/AMDGPU/flat-address-space.ll | 36 +++++++++++++++----
 .../AMDGPU/adjust-alloca-alignment.ll         |  2 +-
 .../AMDGPU/merge-stores-private.ll            | 22 ++++++------
 7 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e0bde330449e..5f7033e2c32f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -67,6 +67,12 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
   "Support unaligned global loads and stores"
 >;
 
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+  "UnalignedScratchAccess",
+  "true",
+  "Support unaligned scratch loads and stores"
+>;
+
 def FeatureXNACK : SubtargetFeature<"xnack",
   "EnableXNACK",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index aba12635453d..bd6984735140 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -85,6 +85,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FP64Denormals(false),
     FPExceptions(false),
     FlatForGlobal(false),
+    UnalignedScratchAccess(false),
     UnalignedBufferAccess(false),
 
     EnableXNACK(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1dd650652015..5682d9993139 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ protected:
   bool FP64Denormals;
   bool FPExceptions;
   bool FlatForGlobal;
+  bool UnalignedScratchAccess;
   bool UnalignedBufferAccess;
   bool EnableXNACK;
   bool DebuggerInsertNops;
@@ -277,6 +278,10 @@ public:
     return UnalignedBufferAccess;
   }
 
+  bool hasUnalignedScratchAccess() const {
+    return UnalignedScratchAccess;
+  }
+
   bool isXNACKEnabled() const {
     return EnableXNACK;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index be669c112b74..e6329511f3f6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -459,6 +459,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return AlignedBy4;
   }
 
+  // FIXME: We have to be conservative here and assume that flat operations
+  // will access scratch. If we had access to the IR function, then we
+  // could determine if any private memory was used in the function.
+  if (!Subtarget->hasUnalignedScratchAccess() &&
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    return false;
+  }
+
   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 5ca57fd3d350..eeb2a2e95dd8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
@@ -73,7 +73,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noa
 ; CHECK: flat_load_dwordx2
 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  %fload = load i64, i64 addrspace(4)* %fptr, align 4
+  %fload = load i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -82,7 +82,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
 ; CHECK: flat_load_dwordx4
 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
+  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
   ret void
 }
@@ -127,6 +127,30 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
   ret void
 }
 
+; CHECK-LABEL: flat_scratch_unaligned_load:
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+define void @flat_scratch_unaligned_load() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
+; CHECK-LABEL: flat_scratch_unaligned_store:
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+define void @flat_scratch_unaligned_store() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind convergent }
 attributes #3 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index fbd9440d757e..4369dafa4258 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "amdgcn--"
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index 4a4237294294..fd0aaa615db0 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -1,19 +1,21 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT4 -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8 -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16 -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
-; ELT4: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
 
-; ELT8: store <2 x i32>
-; ELT8: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
 
-; ELT16: store <4 x i32>
+; ELT16-UNALIGNED: store <4 x i32>
 define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2