[OpenMP] Optimized default kernel launch parameters in CUDA plugin

Differential Revision: https://reviews.llvm.org/D32321

llvm-svn: 301321
This commit is contained in:
George Rokos 2017-04-25 16:34:13 +00:00
parent 8264ed7075
commit c13df8e5e0
1 changed files with 30 additions and 11 deletions

View File

@ -51,8 +51,9 @@ struct FuncOrGblEntryTy {
};
enum ExecutionModeType {
SPMD,
GENERIC,
SPMD, // constructors, destructors,
// combined constructs (`teams distribute parallel for [simd]`)
GENERIC, // everything else
NONE
};
@ -99,7 +100,7 @@ public:
static const int HardTeamLimit = 1<<16; // 64k
static const int HardThreadLimit = 1024;
static const int DefaultNumTeams = 128;
static const int DefaultNumThreads = 1024;
static const int DefaultNumThreads = 128;
// Record entry point associated with device
void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
@ -581,18 +582,17 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
if (thread_limit > 0) {
cudaThreadsPerBlock = thread_limit;
DP("Setting CUDA threads per block to requested %d\n", thread_limit);
// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {
cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
}
} else {
cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
DP("Setting CUDA threads per block to default %d\n",
DeviceInfo.NumThreads[device_id]);
}
// Add master warp if necessary
if (KernelInfo->ExecutionMode == GENERIC) {
cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
}
if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
DP("Threads per block capped at device limit %d\n",
@ -612,8 +612,27 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
int cudaBlocksPerGrid;
if (team_num <= 0) {
if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
// round up to the nearest integer
cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
if (KernelInfo->ExecutionMode == SPMD) {
// We have a combined construct, i.e. `target teams distribute parallel
// for [simd]`. We launch so many teams so that each thread will
// execute one iteration of the loop.
// round up to the nearest integer
cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
} else {
// If we reach this point, then we have a non-combined construct, i.e.
// `teams distribute` with a nested `parallel for` and each team is
// assigned one iteration of the `distribute` loop. E.g.:
//
// #pragma omp target teams distribute
// for(...loop_tripcount...) {
// #pragma omp parallel for
// for(...) {}
// }
//
// Threads within a team will execute the iterations of the `parallel`
// loop.
cudaBlocksPerGrid = loop_tripcount;
}
DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
"threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
cudaThreadsPerBlock);