forked from OSchip/llvm-project
[OpenMP] Add an option to limit shared memory usage in OpenMPOpt
One of the optimizations performed in OpenMPOpt pushes globalized variables to static shared memory. This is preferable to keeping the runtime call in all cases; however, if too many variables are pushed to shared memory the kernel will crash. Since this is an optimization and not something the user specified explicitly, there should be an option to limit this optimization in those cases. This patch introduces the `-openmp-opt-shared-limit=` option to limit the number of bytes that will be placed in shared memory from HeapToShared. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120079
This commit is contained in:
parent
0870a4f59a
commit
0136a4401f
|
@ -129,6 +129,11 @@ static cl::opt<unsigned>
|
|||
cl::desc("Maximal number of attributor iterations."),
|
||||
cl::init(256));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
|
||||
cl::desc("Maximum amount of shared memory to use."),
|
||||
cl::init(std::numeric_limits<unsigned>::max()));
|
||||
|
||||
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
|
||||
"Number of OpenMP runtime calls deduplicated");
|
||||
STATISTIC(NumOpenMPParallelRegionsDeleted,
|
||||
|
@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
|
|||
|
||||
auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
|
||||
|
||||
if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
|
||||
LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
|
||||
<< " with shared memory."
|
||||
<< " Shared memory usage is limited to "
|
||||
<< SharedMemoryLimit << " bytes\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
|
||||
<< " with " << AllocSize->getZExtValue()
|
||||
<< " bytes of shared memory\n");
|
||||
|
@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
|
|||
A.deleteAfterManifest(*CB);
|
||||
A.deleteAfterManifest(*FreeCalls.front());
|
||||
|
||||
NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
|
||||
SharedMemoryUsed += AllocSize->getZExtValue();
|
||||
NumBytesMovedToSharedMemory = SharedMemoryUsed;
|
||||
Changed = ChangeStatus::CHANGED;
|
||||
}
|
||||
|
||||
|
@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
|
|||
SmallSetVector<CallBase *, 4> MallocCalls;
|
||||
/// Collection of potentially removed free calls in a function.
|
||||
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
|
||||
/// The total amount of shared memory that has been used for HeapToShared.
|
||||
unsigned SharedMemoryUsed = 0;
|
||||
};
|
||||
|
||||
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
|
||||
; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
|
||||
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
|
||||
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64"
|
||||
|
||||
|
@ -8,6 +9,8 @@ target triple = "nvptx64"
|
|||
; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
|
||||
; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
|
||||
; CHECK-REMARKS-NOT: 6 bytes
|
||||
; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
|
||||
; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization
|
||||
; UTC_ARGS: --enable
|
||||
|
||||
%struct.ident_t = type { i32, i32, i32, i32, i8* }
|
||||
|
|
Loading…
Reference in New Issue