lammps/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp

95 lines
3.4 KiB
C++

// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
// Returns the smaller of two sizes; when they are equal, x is returned.
inline size_t min(size_t x, size_t y)
{
    if (y < x)
    {
        return y;
    }
    return x;
}
// Returns the larger of two sizes; when they are equal, x is returned.
inline size_t max(size_t x, size_t y)
{
    if (y > x)
    {
        return y;
    }
    return x;
}
// Ceiling division: returns the number of f-sized units needed to cover x,
// i.e. x / f rounded up. Note that the result is a count of units, not a
// multiple of f; use ceiling() to round x itself up to a multiple of f.
// Precondition: f must be non-zero (f == 0 divides by zero).
inline size_t multiple(size_t x, size_t f)
{
    // Quotient plus one when there is a remainder. Unlike (x + (f-1)) / f,
    // this form cannot wrap around when x is close to the largest size_t.
    return x / f + ((x % f != 0) ? 1 : 0);
}
// Equivalent of the MS Excel CEIL() function for unsigned sizes:
// rounds x up to the nearest multiple of f (x itself if already a multiple).
inline size_t ceiling(size_t x, size_t f)
{
    // Number of f-sized units needed to cover x, scaled back up by f.
    const size_t units = multiple(x, f);
    return units * f;
}
// Estimates the maximum number of CTAs (thread blocks) of a kernel that can
// run simultaneously on the whole device. This follows the calculation done
// in the CUDA Occupancy Calculator spreadsheet.
//
// attribs               - kernel attributes; numRegs and sharedSizeBytes are read
// devprop               - device properties; major, minor, warpSize, regsPerBlock,
//                         sharedMemPerBlock and multiProcessorCount are read
// bytesDynamicSharedMem - dynamic shared memory requested per block, in bytes
// threadsPerBlock       - number of threads per block of the intended launch
//
// Returns multiProcessorCount times the per-SM CTA limit (the minimum of the
// register, shared-memory, thread-count and fixed 8-blocks-per-SM limits).
// Returns 0 when threadsPerBlock is 0 or the reported warp size is not
// positive, since no block can be formed and the math below would divide by 0.
//
// NOTE(review): the allocation units, thread limits and the 8-blocks/SM cap
// are hard-coded constants that appear to target older architectures --
// confirm before relying on the result for newer devices.
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
                 cudaDeviceProp &devprop,
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    // Guard the divisions below (threadsPerBlock and warpSize are divisors).
    if (threadsPerBlock == 0 || devprop.warpSize <= 0)
    {
        return 0;
    }

    // True when both major < 2 and minor < 2 (e.g. compute capability 1.0/1.1);
    // these devices use the smaller register unit and thread limit.
    const bool smallLimits = (devprop.major < 2 && devprop.minor < 2);

    const unsigned int regAllocationUnit      = smallLimits ? 256 : 512;  // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit     = 512;                      // in bytes
    const unsigned int maxThreadsPerSM        = smallLimits ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM         = 8;

    // Number of warps per CTA (threads rounded up to whole warps) ...
    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
    // ... rounded up to the warp allocation multiple.
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Registers per CTA: regs per thread * warp size * number of warps,
    // rounded up to a multiple of the register allocation unit.
    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    // Shared memory per CTA: static + dynamic, rounded up to the allocation unit.
    size_t smemBytes  = attribs.sharedSizeBytes + bytesDynamicSharedMem;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // Per-resource CTA limits; a resource the kernel does not use (0 per CTA)
    // imposes no limit beyond the fixed blocks-per-SM cap.
    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;

    // The tightest limit applies per SM; scale by the number of SMs.
    return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
cudaDeviceProp devprop;
int deviceID = -1;
cudaError_t err = cudaGetDevice(&deviceID);
if (err == cudaSuccess)
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
}