From b6b53ffef4414ed62701a63ad28e70cfd9d26191 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Tue, 13 Jul 2021 12:06:55 +0100 Subject: [PATCH] [libomptarget][devicertl] Remove branches around setting parallelLevel Simplifies control flow to allow store/load forwarding. This change folds two basic blocks into one, leaving a single store to parallelLevel. This is a step towards SPMD kernels in which sufficiently aggressive inlining folds the loads from parallelLevel and thus discards the nested parallel handling when it is unused. Transform: ``` int threadId = GetThreadIdInBlock(); if (threadId == 0) { parallelLevel[0] = expr; } else if (GetLaneId() == 0) { parallelLevel[GetWarpId()] = expr; } // => if (GetLaneId() == 0) { parallelLevel[GetWarpId()] = expr; } // because unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1);} // so whenever threadId == 0, GetLaneId() is also 0. ``` That replaces a store in each of two distinct basic blocks with a single store. A more aggressive follow-up is possible if the threads in the warp/wave are allowed to race to write the same value to the same address. This is not done as part of this change. 
``` if (GetLaneId() == 0) { parallelLevel[GetWarpId()] = expr; } // => parallelLevel[GetWarpId()] = expr; // because unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } // so GetWarpId will index the same element for every thread in the warp // and, because expr is lane-invariant in this case, every lane stores the // same value to this unique address ``` Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D105699 --- openmp/libomptarget/deviceRTLs/common/src/omptarget.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu index 153754fc3fdd..21608549edf1 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -90,12 +90,13 @@ static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) { int threadId = GetThreadIdInBlock(); if (threadId == 0) { usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { + } + + if (GetLaneId() == 0) { parallelLevel[GetWarpId()] = 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } + __kmpc_data_sharing_init_stack(); if (!RequiresFullRuntime) return;