[OpenMP] Add an option to limit shared memory usage in OpenMPOpt

One of the optimizations performed in OpenMPOpt pushes globalized
variables to static shared memory. This is preferable to keeping the
runtime call in all cases; however, if too many variables are pushed to
shared memory the kernel will crash. Since this is an optimization and
not something the user specified explicitly, there should be an option
to limit this optimization in those cases. This patch introduces the
`-openmp-opt-shared-limit=` option to limit the number of bytes that
will be placed in shared memory from HeapToShared.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D120079
This commit is contained in:
Joseph Huber 2022-02-17 15:48:12 -05:00
parent 0870a4f59a
commit 0136a4401f
2 changed files with 20 additions and 1 deletions

View File

@ -129,6 +129,11 @@ static cl::opt<unsigned>
cl::desc("Maximal number of attributor iterations."), cl::desc("Maximal number of attributor iterations."),
cl::init(256)); cl::init(256));
/// Hidden command-line option `-openmp-opt-shared-limit=<N>`: upper bound, in
/// bytes, on the shared memory that HeapToShared may use when replacing
/// globalization calls. Defaults to the largest `unsigned` value, so the
/// optimization is effectively unlimited unless the option is given.
static cl::opt<unsigned>
SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
cl::desc("Maximum amount of shared memory to use."),
cl::init(std::numeric_limits<unsigned>::max()));
STATISTIC(NumOpenMPRuntimeCallsDeduplicated, STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
"Number of OpenMP runtime calls deduplicated"); "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted, STATISTIC(NumOpenMPParallelRegionsDeleted,
@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0)); auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
<< " with shared memory."
<< " Shared memory usage is limited to "
<< SharedMemoryLimit << " bytes\n");
continue;
}
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
<< " with " << AllocSize->getZExtValue() << " with " << AllocSize->getZExtValue()
<< " bytes of shared memory\n"); << " bytes of shared memory\n");
@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
A.deleteAfterManifest(*CB); A.deleteAfterManifest(*CB);
A.deleteAfterManifest(*FreeCalls.front()); A.deleteAfterManifest(*FreeCalls.front());
NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); SharedMemoryUsed += AllocSize->getZExtValue();
NumBytesMovedToSharedMemory = SharedMemoryUsed;
Changed = ChangeStatus::CHANGED; Changed = ChangeStatus::CHANGED;
} }
@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
SmallSetVector<CallBase *, 4> MallocCalls; SmallSetVector<CallBase *, 4> MallocCalls;
/// Collection of potentially removed free calls in a function. /// Collection of potentially removed free calls in a function.
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls; SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
/// The total amount of shared memory that has been used for HeapToShared.
unsigned SharedMemoryUsed = 0;
}; };
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {

View File

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s ; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS ; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64" target triple = "nvptx64"
@ -8,6 +9,8 @@ target triple = "nvptx64"
; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
; CHECK-REMARKS-NOT: 6 bytes ; CHECK-REMARKS-NOT: 6 bytes
; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization
; UTC_ARGS: --enable ; UTC_ARGS: --enable
%struct.ident_t = type { i32, i32, i32, i32, i8* } %struct.ident_t = type { i32, i32, i32, i32, i8* }