forked from lijiext/lammps
95 lines
3.4 KiB
C++
95 lines
3.4 KiB
C++
// -------------------------------------------------------------
|
|
// cuDPP -- CUDA Data Parallel Primitives library
|
|
// -------------------------------------------------------------
|
|
// $Revision$
|
|
// $Date$
|
|
// -------------------------------------------------------------
|
|
// This source code is distributed under the terms of license.txt
|
|
// in the root directory of this source distribution.
|
|
// -------------------------------------------------------------
|
|
#include "cudpp_maximal_launch.h"
|
|
|
|
// Returns the smaller of x and y (x when the two are equal).
inline size_t min(size_t x, size_t y)
{
    if (y < x)
    {
        return y;
    }
    return x;
}
|
|
|
|
// Returns the larger of x and y (x when the two are equal).
inline size_t max(size_t x, size_t y)
{
    if (y > x)
    {
        return y;
    }
    return x;
}
|
|
|
|
// Ceiling division: returns how many f-sized units are needed to cover x,
// i.e. ceil(x / f). Note this is the unit COUNT, not x rounded up to a
// multiple of f (ceiling() does that).
//
// f must be non-zero. The result is formed from the quotient and remainder
// rather than (x + f - 1) / f so that values of x close to the top of the
// size_t range cannot wrap around and yield a bogus small result.
inline size_t multiple(size_t x, size_t f)
{
    return x / f + ((x % f != 0) ? 1 : 0);
}
|
|
|
|
|
|
// MS Excel-style CEIL() function
|
|
// Rounds x up to nearest multiple of f
|
|
inline size_t ceiling(size_t x, size_t f)
|
|
{
|
|
return multiple(x, f) * f;
|
|
}
|
|
|
|
extern "C"
|
|
size_t maxBlocks(cudaFuncAttributes &attribs,
|
|
cudaDeviceProp &devprop,
|
|
size_t bytesDynamicSharedMem,
|
|
size_t threadsPerBlock)
|
|
{
|
|
|
|
// Determine the maximum number of CTAs that can be run simultaneously for each kernel
|
|
// This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
|
|
const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
|
|
const unsigned int warpAllocationMultiple = 2;
|
|
const unsigned int smemAllocationUnit = 512; // in bytes
|
|
const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
|
|
const unsigned int maxBlocksPerSM = 8;
|
|
|
|
// Number of warps (round up to nearest whole multiple of warp size)
|
|
size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
|
|
// Round up to warp allocation multiple
|
|
numWarps = ceiling(numWarps, warpAllocationMultiple);
|
|
|
|
// Number of regs is regs per thread times number of warps times warp size
|
|
size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
|
|
// Round up to multiple of register allocation unit size
|
|
regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
|
|
|
|
size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
|
|
size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);
|
|
|
|
size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
|
|
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
|
|
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
|
|
|
|
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
|
|
}
|
|
|
|
extern "C"
|
|
size_t maxBlocksFromPointer(void* kernel,
|
|
size_t bytesDynamicSharedMem,
|
|
size_t threadsPerBlock)
|
|
{
|
|
cudaDeviceProp devprop;
|
|
int deviceID = -1;
|
|
cudaError_t err = cudaGetDevice(&deviceID);
|
|
if (err == cudaSuccess)
|
|
{
|
|
err = cudaGetDeviceProperties(&devprop, deviceID);
|
|
if (err != cudaSuccess)
|
|
return -1;
|
|
|
|
cudaFuncAttributes attr;
|
|
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
|
|
if (err != cudaSuccess)
|
|
return -1;
|
|
|
|
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
|
|
}
|
|
|
|
return -1;
|
|
}
|