// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_cta.cu
 *
 * @brief CUDPP CTA-level scan routines
 */

/** \defgroup cudpp_cta CUDPP CTA-Level API
 * The CUDPP CTA-Level API contains functions that run on the GPU
 * device. These are CUDA \c __device__ functions that are called
 * from within other CUDA device functions (typically
 * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
 * They are called CTA-level functions because they typically process
 * s_data "owned" by each CTA within shared memory, and are agnostic of
 * any other CTAs that may be running (or how many CTAs are running),
 * other than to compute appropriate global memory addresses.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>

/**
 * @brief Macro to insert necessary __syncthreads() in device emulation mode
 */
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @brief Template class containing compile-time parameters to the scan functions
 *
 * ScanTraits is passed as a template parameter to all scan functions. By
 * using these compile-time functions we can enable generic code while
 * maintaining the highest performance. This is crucial for the performance
 * of low-level workhorse algorithms like scan.
 *
 * @param T The datatype of the scan
 * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
 * @param backward True if this is a backward scan
 * @param exclusive True for exclusive scans, false for inclusive scans
 * @param multiRow True if this is a multi-row scan
 * @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
 * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements)
 */
template <class T, CUDPPOperator oper, bool backward, bool exclusive,
          bool multiRow, bool sums, bool fullBlock>
class ScanTraits
{
public:

    //! Returns true if this is a backward scan
    static __device__ bool isBackward() { return backward; }
    //! Returns true if this is an exclusive scan
    static __device__ bool isExclusive() { return exclusive; }
    //! Returns true if this is a multi-row scan
    static __device__ bool isMultiRow() { return multiRow; }
    //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
    static __device__ bool writeSums() { return sums; }
    //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements
    static __device__ bool isFullBlock() { return fullBlock; }

    //! The operator function used for the scan
    static __device__ T op(const T a, const T b)
    {
        return Operator<T, oper>::op(a, b);
    }

    //! The identity value used by the scan
    static __device__ T identity() { return Operator<T, oper>::identity(); }
};
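
// Example (illustrative; the alias name below is not part of CUDPP): a
// forward, inclusive, single-row, single-block scan of floats over full
// blocks corresponds to the instantiation
//
//   typedef ScanTraits<float, CUDPP_ADD,
//                      false,  // backward
//                      false,  // exclusive
//                      false,  // multiRow
//                      false,  // sums
//                      true>   // fullBlock
//       InclusiveAddScan;
//
// which is then passed as the "traits" template argument of the device
// functions below.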

//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80. This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1

/**
 * @brief Handles loading input s_data from global memory to shared memory
 * (vec4 version)
 *
 * Load a chunk of 8*blockDim.x elements from global memory into a
 * shared memory array. Each thread loads two T4 elements (where
 * T4 is, e.g. int4 or float4), computes the scan of those two vec4s in
 * thread local arrays (in registers), and writes the two total sums of the
 * vec4s into shared memory, where they will be cooperatively scanned with
 * the other partial sums by all threads in the CTA.
 *
 * @param[out] s_out The output (shared) memory array
 * @param[out] threadScan0 Intermediate per-thread partial sums array 1
 * @param[out] threadScan1 Intermediate per-thread partial sums array 2
 * @param[in] d_in The input (device) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] iDataOffset the offset of the input array in global memory for this
 * thread block
 * @param[out] ai The shared memory address for the thread's first element
 * (returned for reuse)
 * @param[out] bi The shared memory address for the thread's second element
 * (returned for reuse)
 * @param[out] aiDev The device memory address for this thread's first element
 * (returned for reuse)
 * @param[out] biDev The device memory address for this thread's second element
 * (returned for reuse)
 */
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T       *s_out,
                                        T       threadScan0[4],
                                        T       threadScan1[4],
                                        const T *d_in,
                                        int     numElements,
                                        int     iDataOffset,
                                        int     &ai,
                                        int     &bi,
                                        int     &aiDev,
                                        int     &biDev)
{
    int thid = threadIdx.x;
    aiDev = iDataOffset + thid;
    biDev = aiDev + blockDim.x;

    // convert to 4-vector
    typename typeToVector<T,4>::Result  tempData;
    typename typeToVector<T,4>::Result* inData = (typename typeToVector<T,4>::Result*)d_in;

    ai = thid;
    bi = thid + blockDim.x;

    // read into tempData
    if (traits::isBackward())
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[3] = tempData.w;
            threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
            threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op(tempData.x, threadScan0[1]);
        }
        else
        {
            threadScan0[3] = traits::identity();
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[3] = tempData.w;
            threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
            threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op(tempData.x, threadScan1[1]);
        }
        else
        {
            threadScan1[3] = traits::identity();
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
        }
        __syncthreads();

        // reverse s_data in shared memory
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp           = s_out[leftIdx];
                s_out[leftIdx]  = s_out[rightIdx];
                s_out[rightIdx] = tmp;
            }
        }
        __syncthreads();
    }
    else
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[0] = tempData.x;
            threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
            threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(tempData.w, threadScan0[2]);
        }
        else
        {
            threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[0] = tempData.x;
            threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
            threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(tempData.w, threadScan1[2]);
        }
        else
        {
            threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
        }
        __syncthreads();
    }
}
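
// Worked example of the addressing above, assuming CTA_SIZE = 128 (so
// blockDim.x == 128) as noted in scanCTA below: thread 5 of a block whose
// iDataOffset is 256 gets ai = 5, bi = 133, aiDev = 261 and biDev = 389.
// Because inData is a vec4 pointer, it loads scalar elements 1044..1047 and
// 1556..1559, and writes the two vec4 running totals to s_out[5] and
// s_out[133].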

/**
 * @brief Handles storing result s_data from shared memory to global memory
 * (vec4 version)
 *
 * Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
 * into a device memory array. Each thread reads two elements from shared
 * memory, adds them to the intermediate sums computed in
 * loadSharedChunkFromMem4(), and writes two T4 elements (where
 * T4 is, e.g. int4 or float4) to global memory.
 *
 * @param[out] d_out The output (device) memory array
 * @param[in] threadScan0 Intermediate per-thread partial sums array 1
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] threadScan1 Intermediate per-thread partial sums array 2
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] s_in The input (shared) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] oDataOffset the offset of the output array in global memory
 * for this thread block
 * @param[in] ai The shared memory address for the thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] bi The shared memory address for the thread's second element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] aiDev The device memory address for this thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] biDev The device memory address for this thread's second element
 * (computed in loadSharedChunkFromMem4())
 */
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T   *d_out,
                                       T   threadScan0[4],
                                       T   threadScan1[4],
                                       T   *s_in,
                                       int numElements,
                                       int oDataOffset,
                                       int ai,
                                       int bi,
                                       int aiDev,
                                       int biDev)
{
    // Convert to 4-vector
    typename typeToVector<T,4>::Result tempData;
    typename typeToVector<T,4>::Result* outData = (typename typeToVector<T,4>::Result*)d_out;

    // write results to global memory
    if (traits::isBackward())
    {
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp          = s_in[leftIdx];
                s_in[leftIdx]  = s_in[rightIdx];
                s_in[rightIdx] = tmp;
            }
        }
        __syncthreads();

        T temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan0[3]);
            tempData.y = traits::op(temp, threadScan0[2]);
            tempData.x = traits::op(temp, threadScan0[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan0[3]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.x = traits::op(temp, threadScan0[0]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            if (i   < numElements) { d_out[i]   = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan1[3]);
            tempData.y = traits::op(temp, threadScan1[2]);
            tempData.x = traits::op(temp, threadScan1[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan1[3]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.x = traits::op(temp, threadScan1[0]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            if (i   < numElements) { d_out[i]   = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }
    }
    else
    {
        T temp;
        temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan0[0]);
            tempData.z = traits::op(temp, threadScan0[1]);
            tempData.w = traits::op(temp, threadScan0[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan0[0]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.w = traits::op(temp, threadScan0[3]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if (i     < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan1[0]);
            tempData.z = traits::op(temp, threadScan1[1]);
            tempData.w = traits::op(temp, threadScan1[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan1[0]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.w = traits::op(temp, threadScan1[3]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if (i     < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }
    }
}

/** @brief Scan all warps of a CTA without synchronization
 *
 * The warp-scan algorithm breaks a block of data into warp-sized chunks, and
 * scans the chunks independently with a warp of threads each. Because warps
 * execute instructions in SIMD fashion, there is no need to synchronize in
 * order to share data within a warp (only across warps). Also, in SIMD the
 * most efficient algorithm is a step-efficient algorithm. Therefore, within
 * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
 * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
 * work-efficient tree-based algorithm described by Guy Blelloch [1990] that
 * takes 2 * log(N) steps and is in general more complex to implement.
 * Previous versions of CUDPP used the Blelloch algorithm. For current GPUs,
 * the warp size is 32, so this takes five steps per warp.
 *
 * Each thread is responsible for a single element of the array to be scanned.
 * Each thread inputs a single value to the scan via \a val and returns
 * its own scanned result element. The threads of each warp cooperate
 * via the shared memory array \a s_data to scan WARP_SIZE elements.
 *
 * Template parameter \a maxlevel allows this warpscan to be performed on
 * partial warps. For example, if only the first 8 elements of each warp need
 * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
 *
 * The computation uses 2 * WARP_SIZE elements of shared memory per warp to
 * enable warps to offset beyond their input data and receive the identity
 * element without using any branch instructions.
 *
 * \note s_data is declared volatile here to prevent the compiler from
 * optimizing away writes to shared memory, and ensure correct intrawarp
 * communication in the absence of __syncthreads.
 *
 * @return The result of the warp scan for the current thread
 * @param[in] val The current thread's input to the scan
 * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
 * elements used to compute the warp scans
 */
template<class T, class traits, int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
    // The following is the same as 2 * 32 * warpId + threadInWarp =
    // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
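    // Worked example (WARP_SIZE = 32): thread 37 has warpId = 1 and
    // threadInWarp = 5, so idx = 2*37 - 5 = 69 = 64*1 + 5. Each warp thus
    // owns a 64-element window of s_data whose lower half is seeded with
    // the identity, so the offset reads below never need a bounds check.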
    int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
    s_data[idx] = traits::identity();
    idx += WARP_SIZE;
    T t = s_data[idx] = val;  __EMUSYNC;

    // This code is needed because the warp size of device emulation
    // is only 1 thread, so sync-less cooperation within a warp doesn't
    // work.
#ifdef __DEVICE_EMULATION__
    t = s_data[idx -  1]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  2]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  4]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  8]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx - 16]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
#else
    if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); }
    if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); }
    if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); }
    if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); }
    if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); }
#endif

    return s_data[idx-1]; // convert inclusive -> exclusive
}

/** @brief Perform a full CTA scan using the warp-scan algorithm
 *
 * As described in the comment for warpscan(), the warp-scan algorithm breaks
 * a block of data into warp-sized chunks, and scans the chunks independently
 * with a warp of threads each. To complete the scan, each warp <i>j</i> then
 * writes its last element to element <i>j</i> of a temporary shared array.
 * Then a single warp exclusive-scans these "warp sums". Finally, each thread
 * adds the result of the warp sum scan to the result of the scan from the
 * first pass.
 *
 * Because we scan 2*CTA_SIZE elements per block (two per thread), we have to
 * call warpscan twice.
 *
 * @param x The first input value for the current thread
 * @param y The second input value for the current thread
 * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
 * performing the scan
 */
template <class T, class traits>
__device__ void scanWarps(T x, T y,
                          T *s_data)
{
    T val  = warpscan<T, traits, 4>(x, s_data);
    __syncthreads();
    T val2 = warpscan<T, traits, 4>(y, s_data);

    int idx = threadIdx.x;

    if ((idx & 31)==31)
    {
        s_data[idx >> 5]                = traits::op(val, x);
        s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
    }
    __syncthreads();

#ifndef __DEVICE_EMULATION__
    if (idx < 32)
#endif
    {
        s_data[idx] = warpscan<T,traits,(LOG_CTA_SIZE-LOG_WARP_SIZE+1)>(s_data[idx], s_data);
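        // With CTA_SIZE = 128 and WARP_SIZE = 32 (the sizes this file
        // assumes; see the note in scanCTA below), the writes above leave
        // 2*128/32 = 8 warp sums in s_data[0..7], and maxlevel =
        // LOG_CTA_SIZE - LOG_WARP_SIZE + 1 = 3 gives this single warp
        // enough steps to scan all of them.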
    }
    __syncthreads();

    val  = traits::op(val, s_data[idx >> 5]);

    val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);

    __syncthreads();

    s_data[idx]              = val;
    s_data[idx + blockDim.x] = val2;
}

/**
 * @brief CTA-level scan routine; scans s_data in shared memory in each thread block
 *
 * This function is the main CTA-level scan function. It may be called by other
 * CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements.
 * Each thread is responsible for one element in each half of the input array.
 * \note This code is intended to be run on a CTA of 128 threads. Other sizes are
 * untested.
 *
 * @param[in] s_data The array to be scanned in shared memory
 * @param[out] d_blockSums Array of per-block sums
 * @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
 */
template <class T, class traits>
__device__ void scanCTA(T            *s_data,
                        T            *d_blockSums,
                        unsigned int blockSumIndex)
{
    T val  = s_data[threadIdx.x];
    T val2 = s_data[threadIdx.x + blockDim.x];
    __syncthreads();

    scanWarps<T,traits>(val, val2, s_data);
    __syncthreads();

    if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
    {
        d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
    }

#ifdef __DEVICE_EMULATION__
    // must sync in emulation mode when doing backward scans, because otherwise the
    // shared memory array will get reversed before the block sums are read!
    if (traits::isBackward())
        __syncthreads();
#endif
}
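
// A minimal usage sketch (illustrative only; the kernel and typedef names
// below are hypothetical, not part of CUDPP). A single-block, forward,
// exclusive add-scan of up to 8 * CTA_SIZE floats could be assembled from
// the routines above like so:
//
//   typedef ScanTraits<float, CUDPP_ADD,
//                      false,  // backward
//                      true,   // exclusive
//                      false,  // multiRow
//                      false,  // sums: single block, no block sums
//                      false>  // fullBlock: allow a partial final block
//       ExclusiveAddScan;
//
//   __global__ void scanSingleBlock(float *d_out, const float *d_in,
//                                   int numElements)
//   {
//       __shared__ float s_data[2 * CTA_SIZE]; // two scan elements per thread
//       float threadScan0[4], threadScan1[4];  // per-thread vec4 partials
//       int ai, bi, aiDev, biDev;
//
//       // stage 8 elements per thread; offset 0 because there is one block
//       loadSharedChunkFromMem4<float, ExclusiveAddScan>
//           (s_data, threadScan0, threadScan1, d_in, numElements, 0,
//            ai, bi, aiDev, biDev);
//
//       // scan the 2 * CTA_SIZE partial sums; no d_blockSums needed
//       scanCTA<float, ExclusiveAddScan>(s_data, (float*)0, 0);
//
//       storeSharedChunkToMem4<float, ExclusiveAddScan>
//           (d_out, threadScan0, threadScan1, s_data, numElements, 0,
//            ai, bi, aiDev, biDev);
//   }
//
// launched as scanSingleBlock<<<1, CTA_SIZE>>>(d_out, d_in, n) with
// n <= 8 * CTA_SIZE.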

/** @} */ // end scan functions
/** @} */ // end cudpp_cta