// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_cta.cu
 *
 * @brief CUDPP CTA-level scan routines
 */

/** \defgroup cudpp_cta CUDPP CTA-Level API
 * The CUDPP CTA-Level API contains functions that run on the GPU
 * device. These are CUDA \c __device__ functions that are called
 * from within other CUDA device functions (typically
 * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
 * They are called CTA-level functions because they typically process
 * s_data "owned" by each CTA within shared memory, and are agnostic of
 * any other CTAs that may be running (or how many CTAs are running),
 * other than to compute appropriate global memory addresses.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>

/**
 * @brief Macro to insert necessary __syncthreads() in device emulation mode
 */
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @brief Template class containing compile-time parameters to the scan functions
 *
 * ScanTraits is passed as a template parameter to all scan functions. By
 * using these compile-time functions we can enable generic code while
 * maintaining the highest performance. This is crucial for the performance
 * of low-level workhorse algorithms like scan.
 *
 * @param T The datatype of the scan
 * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
 * @param backward True if this is a backward scan
 * @param exclusive True for exclusive scans, false for inclusive scans
 * @param multiRow True if this is a multi-row scan
 * @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
 * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements)
 */
template <class T, CUDPPOperator oper, bool backward, bool exclusive,
          bool multiRow, bool sums, bool fullBlock>
class ScanTraits
{
public:

    //! Returns true if this is a backward scan
    static __device__ bool isBackward() { return backward; }
    //! Returns true if this is an exclusive scan
    static __device__ bool isExclusive() { return exclusive; }
    //! Returns true if this is a multi-row scan
    static __device__ bool isMultiRow() { return multiRow; }
    //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
    static __device__ bool writeSums() { return sums; }
    //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements
    static __device__ bool isFullBlock() { return fullBlock; }

    //! The operator function used for the scan
    static __device__ T op(const T a, const T b)
    {
        return Operator<T, oper>::op(a, b);
    }

    //! The identity value used by the scan
    static __device__ T identity() { return Operator<T, oper>::identity(); }
};
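
// Example (illustrative; the alias name below is not part of CUDPP): a
// forward, inclusive, single-row, single-block scan of floats over full
// blocks corresponds to the instantiation
//
//   typedef ScanTraits<float, CUDPP_ADD,
//                      false,  // backward
//                      false,  // exclusive
//                      false,  // multiRow
//                      false,  // sums
//                      true>   // fullBlock
//       InclusiveAddScan;
//
// which is then passed as the "traits" template argument of the device
// functions below.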

//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80. This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1

/**
 * @brief Handles loading input s_data from global memory to shared memory
 * (vec4 version)
 *
 * Load a chunk of 8*blockDim.x elements from global memory into a
 * shared memory array. Each thread loads two T4 elements (where
 * T4 is, e.g. int4 or float4), computes the scan of those two vec4s in
 * thread local arrays (in registers), and writes the two total sums of the
 * vec4s into shared memory, where they will be cooperatively scanned with
 * the other partial sums by all threads in the CTA.
 *
 * @param[out] s_out The output (shared) memory array
 * @param[out] threadScan0 Intermediate per-thread partial sums array 1
 * @param[out] threadScan1 Intermediate per-thread partial sums array 2
 * @param[in] d_in The input (device) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] iDataOffset the offset of the input array in global memory for this
 * thread block
 * @param[out] ai The shared memory address for the thread's first element
 * (returned for reuse)
 * @param[out] bi The shared memory address for the thread's second element
 * (returned for reuse)
 * @param[out] aiDev The device memory address for this thread's first element
 * (returned for reuse)
 * @param[out] biDev The device memory address for this thread's second element
 * (returned for reuse)
 */
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T       *s_out,
                                        T       threadScan0[4],
                                        T       threadScan1[4],
                                        const T *d_in,
                                        int     numElements,
                                        int     iDataOffset,
                                        int     &ai,
                                        int     &bi,
                                        int     &aiDev,
                                        int     &biDev)
{
    int thid = threadIdx.x;
    aiDev = iDataOffset + thid;
    biDev = aiDev + blockDim.x;

    // convert to 4-vector
    typename typeToVector<T,4>::Result  tempData;
    typename typeToVector<T,4>::Result* inData = (typename typeToVector<T,4>::Result*)d_in;

    ai = thid;
    bi = thid + blockDim.x;

    // read into tempData
    if (traits::isBackward())
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[3] = tempData.w;
            threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
            threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op(tempData.x, threadScan0[1]);
        }
        else
        {
            threadScan0[3] = traits::identity();
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[3] = tempData.w;
            threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
            threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op(tempData.x, threadScan1[1]);
        }
        else
        {
            threadScan1[3] = traits::identity();
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
        }
        __syncthreads();

        // reverse s_data in shared memory
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp           = s_out[leftIdx];
                s_out[leftIdx]  = s_out[rightIdx];
                s_out[rightIdx] = tmp;
            }
        }
        __syncthreads();
    }
    else
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[0] = tempData.x;
            threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
            threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(tempData.w, threadScan0[2]);
        }
        else
        {
            threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[0] = tempData.x;
            threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
            threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(tempData.w, threadScan1[2]);
        }
        else
        {
            threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
        }
        __syncthreads();
    }
}
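
// Worked example of the addressing above, assuming CTA_SIZE = 128 (so
// blockDim.x == 128) as noted in scanCTA below: thread 5 of a block whose
// iDataOffset is 256 gets ai = 5, bi = 133, aiDev = 261 and biDev = 389.
// Because inData is a vec4 pointer, it loads scalar elements 1044..1047 and
// 1556..1559, and writes the two vec4 running totals to s_out[5] and
// s_out[133].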

/**
 * @brief Handles storing result s_data from shared memory to global memory
 * (vec4 version)
 *
 * Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
 * into a device memory array. Each thread reads two elements from shared
 * memory, adds them to the intermediate sums computed in
 * loadSharedChunkFromMem4(), and writes two T4 elements (where
 * T4 is, e.g. int4 or float4) to global memory.
 *
 * @param[out] d_out The output (device) memory array
 * @param[in] threadScan0 Intermediate per-thread partial sums array 1
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] threadScan1 Intermediate per-thread partial sums array 2
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] s_in The input (shared) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] oDataOffset the offset of the output array in global memory
 * for this thread block
 * @param[in] ai The shared memory address for the thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] bi The shared memory address for the thread's second element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] aiDev The device memory address for this thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] biDev The device memory address for this thread's second element
 * (computed in loadSharedChunkFromMem4())
 */
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T   *d_out,
                                       T   threadScan0[4],
                                       T   threadScan1[4],
                                       T   *s_in,
                                       int numElements,
                                       int oDataOffset,
                                       int ai,
                                       int bi,
                                       int aiDev,
                                       int biDev)
{
    // Convert to 4-vector
    typename typeToVector<T,4>::Result tempData;
    typename typeToVector<T,4>::Result* outData = (typename typeToVector<T,4>::Result*)d_out;

    // write results to global memory
    if (traits::isBackward())
    {
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp          = s_in[leftIdx];
                s_in[leftIdx]  = s_in[rightIdx];
                s_in[rightIdx] = tmp;
            }
        }
        __syncthreads();

        T temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan0[3]);
            tempData.y = traits::op(temp, threadScan0[2]);
            tempData.x = traits::op(temp, threadScan0[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan0[3]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.x = traits::op(temp, threadScan0[0]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            if (i   < numElements) { d_out[i]   = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan1[3]);
            tempData.y = traits::op(temp, threadScan1[2]);
            tempData.x = traits::op(temp, threadScan1[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan1[3]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.x = traits::op(temp, threadScan1[0]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            if (i   < numElements) { d_out[i]   = tempData.x;
            if (i+1 < numElements) { d_out[i+1] = tempData.y;
            if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
        }
    }
    else
    {
        T temp;
        temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan0[0]);
            tempData.z = traits::op(temp, threadScan0[1]);
            tempData.w = traits::op(temp, threadScan0[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan0[0]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.w = traits::op(temp, threadScan0[3]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if (i     < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan1[0]);
            tempData.z = traits::op(temp, threadScan1[1]);
            tempData.w = traits::op(temp, threadScan1[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan1[0]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.w = traits::op(temp, threadScan1[3]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            // we can't use vec4 because the original array isn't a multiple of
            // 4 elements
            if (i     < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }
    }
}

/** @brief Scan all warps of a CTA without synchronization
 *
 * The warp-scan algorithm breaks a block of data into warp-sized chunks, and
 * scans the chunks independently with a warp of threads each. Because warps
 * execute instructions in SIMD fashion, there is no need to synchronize in
 * order to share data within a warp (only across warps). Also, in SIMD the
 * most efficient algorithm is a step-efficient algorithm. Therefore, within
 * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
 * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
 * work-efficient tree-based algorithm described by Guy Blelloch [1990] that
 * takes 2 * log(N) steps and is in general more complex to implement.
 * Previous versions of CUDPP used the Blelloch algorithm. For current GPUs,
 * the warp size is 32, so this takes five steps per warp.
 *
 * Each thread is responsible for a single element of the array to be scanned.
 * Each thread inputs a single value to the scan via \a val and returns
 * its own scanned result element. The threads of each warp cooperate
 * via the shared memory array \a s_data to scan WARP_SIZE elements.
 *
 * Template parameter \a maxlevel allows this warpscan to be performed on
 * partial warps. For example, if only the first 8 elements of each warp need
 * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
 *
 * The computation uses 2 * WARP_SIZE elements of shared memory per warp to
 * enable warps to offset beyond their input data and receive the identity
 * element without using any branch instructions.
 *
 * \note s_data is declared volatile here to prevent the compiler from
 * optimizing away writes to shared memory, and ensure correct intrawarp
 * communication in the absence of __syncthreads.
 *
 * @return The result of the warp scan for the current thread
 * @param[in] val The current thread's input to the scan
 * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
 * elements used to compute the warp scans
 */
template<class T, class traits, int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
    // The following is the same as 2 * 32 * warpId + threadInWarp =
    // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
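    // Worked example (WARP_SIZE = 32): thread 37 has warpId = 1 and
    // threadInWarp = 5, so idx = 2*37 - 5 = 69 = 64*1 + 5. Each warp thus
    // owns a 64-element window of s_data whose lower half is seeded with
    // the identity, so the offset reads below never need a bounds check.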
    int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
    s_data[idx] = traits::identity();
    idx += WARP_SIZE;
    T t = s_data[idx] = val;  __EMUSYNC;

    // This code is needed because the warp size of device emulation
    // is only 1 thread, so sync-less cooperation within a warp doesn't
    // work.
#ifdef __DEVICE_EMULATION__
    t = s_data[idx -  1]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  2]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  4]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx -  8]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
    t = s_data[idx - 16]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
#else
    if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); }
    if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); }
    if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); }
    if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); }
    if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); }
#endif

    return s_data[idx-1]; // convert inclusive -> exclusive
}

/** @brief Perform a full CTA scan using the warp-scan algorithm
 *
 * As described in the comment for warpscan(), the warp-scan algorithm breaks
 * a block of data into warp-sized chunks, and scans the chunks independently
 * with a warp of threads each. To complete the scan, each warp <i>j</i> then
 * writes its last element to element <i>j</i> of a temporary shared array.
 * Then a single warp exclusive-scans these "warp sums". Finally, each thread
 * adds the result of the warp sum scan to the result of the scan from the
 * first pass.
 *
 * Because we scan 2*CTA_SIZE elements per block (two per thread), we have to
 * call warpscan twice.
 *
 * @param x The first input value for the current thread
 * @param y The second input value for the current thread
 * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
 * performing the scan
 */
template <class T, class traits>
__device__ void scanWarps(T x, T y,
                          T *s_data)
{
    T val  = warpscan<T, traits, 4>(x, s_data);
    __syncthreads();
    T val2 = warpscan<T, traits, 4>(y, s_data);

    int idx = threadIdx.x;

    if ((idx & 31)==31)
    {
        s_data[idx >> 5]                = traits::op(val, x);
        s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
    }
    __syncthreads();

#ifndef __DEVICE_EMULATION__
    if (idx < 32)
#endif
    {
        s_data[idx] = warpscan<T,traits,(LOG_CTA_SIZE-LOG_WARP_SIZE+1)>(s_data[idx], s_data);
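        // With CTA_SIZE = 128 and WARP_SIZE = 32 (the sizes this file
        // assumes; see the note in scanCTA below), the writes above leave
        // 2*128/32 = 8 warp sums in s_data[0..7], and maxlevel =
        // LOG_CTA_SIZE - LOG_WARP_SIZE + 1 = 3 gives this single warp
        // enough steps to scan all of them.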
    }
    __syncthreads();

    val  = traits::op(val, s_data[idx >> 5]);

    val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);

    __syncthreads();

    s_data[idx]              = val;
    s_data[idx + blockDim.x] = val2;
}

/**
 * @brief CTA-level scan routine; scans s_data in shared memory in each thread block
 *
 * This function is the main CTA-level scan function. It may be called by other
 * CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements.
 * Each thread is responsible for one element in each half of the input array.
 * \note This code is intended to be run on a CTA of 128 threads. Other sizes are
 * untested.
 *
 * @param[in] s_data The array to be scanned in shared memory
 * @param[out] d_blockSums Array of per-block sums
 * @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
 */
template <class T, class traits>
__device__ void scanCTA(T            *s_data,
                        T            *d_blockSums,
                        unsigned int blockSumIndex)
{
    T val  = s_data[threadIdx.x];
    T val2 = s_data[threadIdx.x + blockDim.x];
    __syncthreads();

    scanWarps<T,traits>(val, val2, s_data);
    __syncthreads();

    if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
    {
        d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
    }

#ifdef __DEVICE_EMULATION__
    // must sync in emulation mode when doing backward scans, because otherwise the
    // shared memory array will get reversed before the block sums are read!
    if (traits::isBackward())
        __syncthreads();
#endif
}
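
// A minimal usage sketch (illustrative only; the kernel and typedef names
// below are hypothetical, not part of CUDPP). A single-block, forward,
// exclusive add-scan of up to 8 * CTA_SIZE floats could be assembled from
// the routines above like so:
//
//   typedef ScanTraits<float, CUDPP_ADD,
//                      false,  // backward
//                      true,   // exclusive
//                      false,  // multiRow
//                      false,  // sums: single block, no block sums
//                      false>  // fullBlock: allow a partial final block
//       ExclusiveAddScan;
//
//   __global__ void scanSingleBlock(float *d_out, const float *d_in,
//                                   int numElements)
//   {
//       __shared__ float s_data[2 * CTA_SIZE]; // two scan elements per thread
//       float threadScan0[4], threadScan1[4];  // per-thread vec4 partials
//       int ai, bi, aiDev, biDev;
//
//       // stage 8 elements per thread; offset 0 because there is one block
//       loadSharedChunkFromMem4<float, ExclusiveAddScan>
//           (s_data, threadScan0, threadScan1, d_in, numElements, 0,
//            ai, bi, aiDev, biDev);
//
//       // scan the 2 * CTA_SIZE partial sums; no d_blockSums needed
//       scanCTA<float, ExclusiveAddScan>(s_data, (float*)0, 0);
//
//       storeSharedChunkToMem4<float, ExclusiveAddScan>
//           (d_out, threadScan0, threadScan1, s_data, numElements, 0,
//            ai, bi, aiDev, biDev);
//   }
//
// launched as scanSingleBlock<<<1, CTA_SIZE>>>(d_out, d_in, n) with
// n <= 8 * CTA_SIZE.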

/** @} */ // end scan functions
/** @} */ // end cudpp_cta