forked from lijiext/lammps
772 lines
31 KiB
Plaintext
772 lines
31 KiB
Plaintext
// -------------------------------------------------------------
|
|
// CUDPP -- CUDA Data Parallel Primitives library
|
|
// -------------------------------------------------------------
|
|
// $Revision: 5633 $
|
|
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
|
|
// -------------------------------------------------------------
|
|
// This source code is distributed under the terms of license.txt
|
|
// in the root directory of this source distribution.
|
|
// -------------------------------------------------------------
|
|
|
|
/**
|
|
* @file
|
|
* scan_app.cu
|
|
*
|
|
* @brief CUDPP application-level scan routines
|
|
*/
|
|
|
|
/** \defgroup cudpp_app CUDPP Application-Level API
|
|
* The CUDPP Application-Level API contains functions
|
|
* that run on the host CPU and invoke GPU routines in
|
|
* the CUDPP \link cudpp_kernel Kernel-Level API\endlink.
|
|
* Application-Level API functions are used by
|
|
* CUDPP \link publicInterface Public Interface\endlink
|
|
* functions to implement CUDPP's core functionality.
|
|
* @{
|
|
*/
|
|
|
|
/** @name Scan Functions
|
|
* @{
|
|
*/
|
|
|
|
#include "cudpp.h"
|
|
#include "cudpp_util.h"
|
|
#include "cudpp_plan.h"
|
|
#include "kernel/scan_kernel.cu"
|
|
#include "kernel/vector_kernel.cu"
|
|
|
|
|
|
#include <cutil.h>
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
#include <assert.h>
|
|
|
|
/** @brief Perform recursive scan on arbitrary size arrays
|
|
*
|
|
* This is the CPU-side workhorse function of the scan engine. This function
|
|
* invokes the CUDA kernels which perform the scan on individual blocks.
|
|
*
|
|
* Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans,
|
|
* where each block is scanned by a single CUDA thread block. At each recursive level of the
|
|
* scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level
|
|
* has more than one block, it calls itself recursively. On returning from each recursive level,
|
|
* the total sum of each block from the level below is added to all elements of the corresponding
|
|
* block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see
|
|
* \ref references ).
|
|
*
|
|
* Template parameter \a T is the datatype; \a isBackward specifies backward or forward scan;
|
|
* \a isExclusive specifies exclusive or inclusive scan, and \a op specifies the binary associative
|
|
* operator to be used.
|
|
*
|
|
* @param[out] d_out The output array for the scan results
|
|
* @param[in] d_in The input array to be scanned
|
|
* @param[out] d_blockSums Array of arrays of per-block sums (one array per recursive level, allocated
|
|
* by allocScanStorage())
|
|
* @param[in] numElements The number of elements in the array to scan
|
|
* @param[in] numRows The number of rows in the array to scan
|
|
* @param[in] rowPitches Array of row pitches (one array per recursive level, allocated by
|
|
* allocScanStorage())
|
|
* @param[in] level The current recursive level of the scan
|
|
*/
|
|
template <class T, bool isBackward, bool isExclusive, CUDPPOperator op>
|
|
void scanArrayRecursive(T *d_out,
|
|
const T *d_in,
|
|
T **d_blockSums,
|
|
size_t numElements,
|
|
size_t numRows,
|
|
const size_t *rowPitches,
|
|
int level)
|
|
{
|
|
unsigned int numBlocks =
|
|
max(1, (unsigned int)ceil((double)numElements / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
|
|
|
|
unsigned int sharedEltsPerBlock = CTA_SIZE * 2;
|
|
|
|
unsigned int sharedMemSize = sizeof(T) * sharedEltsPerBlock;
|
|
|
|
// divide pitch by four since scan's load/store addresses are for vec4 elements
|
|
unsigned int rowPitch = 1;
|
|
unsigned int blockSumRowPitch = 1;
|
|
|
|
if (numRows > 1)
|
|
{
|
|
rowPitch = rowPitches[level] / 4;
|
|
blockSumRowPitch = (numBlocks > 1) ? rowPitches[level+1] / 4 : 0;
|
|
}
|
|
|
|
bool fullBlock = (numElements == numBlocks * SCAN_ELTS_PER_THREAD * CTA_SIZE);
|
|
|
|
// setup execution parameters
|
|
dim3 grid(numBlocks, numRows, 1);
|
|
dim3 threads(CTA_SIZE, 1, 1);
|
|
|
|
// make sure there are no CUDA errors before we start
|
|
CUT_CHECK_ERROR("scanArray before kernels");
|
|
|
|
unsigned int traitsCode = 0;
|
|
if (numBlocks > 1) traitsCode |= 1;
|
|
if (numRows > 1) traitsCode |= 2;
|
|
if (fullBlock) traitsCode |= 4;
|
|
|
|
switch (traitsCode)
|
|
{
|
|
case 0: // single block, single row, non-full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, false> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 1: // multiblock, single row, non-full block
|
|
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, false> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 2: // single block, multirow, non-full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, false> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 3: // multiblock, multirow, non-full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, false> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 4: // single block, single row, full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, true> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 5: // multiblock, single row, full block
|
|
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, true> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 6: // single block, multirow, full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, true> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
case 7: // multiblock, multirow, full block
|
|
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, true> >
|
|
<<< grid, threads, sharedMemSize >>>
|
|
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
|
|
break;
|
|
}
|
|
|
|
CUT_CHECK_ERROR("prescan");
|
|
|
|
if (numBlocks > 1)
|
|
{
|
|
// After scanning all the sub-blocks, we are mostly done. But
|
|
// now we need to take all of the last values of the
|
|
// sub-blocks and scan those. This will give us a new value
|
|
// that must be sdded to each block to get the final results.
|
|
|
|
scanArrayRecursive<T, isBackward, true, op>
|
|
((T*)d_blockSums[level], (const T*)d_blockSums[level],
|
|
(T**)d_blockSums, numBlocks, numRows, rowPitches, level + 1); // recursive (CPU) call
|
|
|
|
vectorAddUniform4<T, op, SCAN_ELTS_PER_THREAD>
|
|
<<< grid, threads >>>(d_out,
|
|
(T*)d_blockSums[level],
|
|
numElements,
|
|
rowPitch*4,
|
|
blockSumRowPitch*4,
|
|
0, 0);
|
|
CUT_CHECK_ERROR("vectorAddUniform");
|
|
}
|
|
}
|
|
|
|
// global
|
|
|
|
#ifdef __cplusplus
|
|
extern "C"
|
|
{
|
|
#endif
|
|
|
|
/** @brief Allocate intermediate arrays used by scan.
|
|
*
|
|
* Scans of large arrays must be split (possibly recursively) into a hierarchy
|
|
* of block scans, where each block is scanned by a single CUDA thread block.
|
|
* At each recursive level of the scan, we need an array in which to store the
|
|
* total sums of all blocks in that level. This function computes the amount
|
|
* of storage needed and allocates it.
|
|
*
|
|
* @param plan Pointer to CUDPPScanPlan object containing options and number
|
|
* of elements, which is used to compute storage requirements, and
|
|
* within which intermediate storage is allocated.
|
|
*/
|
|
void allocScanStorage(CUDPPScanPlan *plan)
|
|
{
|
|
//assert(config->_numEltsAllocated == 0); // shouldn't be called
|
|
|
|
plan->m_numEltsAllocated = plan->m_numElements;
|
|
|
|
size_t numElts = plan->m_numElements;
|
|
|
|
size_t level = 0;
|
|
|
|
do
|
|
{
|
|
size_t numBlocks =
|
|
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
|
|
if (numBlocks > 1)
|
|
{
|
|
level++;
|
|
}
|
|
numElts = numBlocks;
|
|
} while (numElts > 1);
|
|
|
|
size_t elementSize = 0;
|
|
|
|
switch(plan->m_config.datatype)
|
|
{
|
|
case CUDPP_INT:
|
|
plan->m_blockSums = (void**) malloc(level * sizeof(int*));
|
|
elementSize = sizeof(int);
|
|
break;
|
|
case CUDPP_UINT:
|
|
plan->m_blockSums = (void**) malloc(level * sizeof(unsigned int*));
|
|
elementSize = sizeof(unsigned int);
|
|
break;
|
|
case CUDPP_FLOAT:
|
|
plan->m_blockSums = (void**) malloc(level * sizeof(float*));
|
|
elementSize = sizeof(float);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
plan->m_numLevelsAllocated = level;
|
|
numElts = plan->m_numElements;
|
|
size_t numRows = plan->m_numRows;
|
|
plan->m_numRowsAllocated = numRows;
|
|
plan->m_rowPitches = 0;
|
|
|
|
if (numRows > 1)
|
|
{
|
|
plan->m_rowPitches = (size_t*) malloc((level + 1) * sizeof(size_t));
|
|
plan->m_rowPitches[0] = plan->m_rowPitch;
|
|
}
|
|
|
|
level = 0;
|
|
|
|
do
|
|
{
|
|
size_t numBlocks =
|
|
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
|
|
if (numBlocks > 1)
|
|
{
|
|
// Use cudaMallocPitch for multi-row block sums to ensure alignment
|
|
if (numRows > 1)
|
|
{
|
|
size_t dpitch;
|
|
CUDA_SAFE_CALL( cudaMallocPitch((void**) &(plan->m_blockSums[level]),
|
|
&dpitch,
|
|
numBlocks * elementSize,
|
|
numRows));
|
|
plan->m_rowPitches[level+1] = dpitch / elementSize;
|
|
level++;
|
|
}
|
|
else
|
|
{
|
|
CUDA_SAFE_CALL(cudaMalloc((void**) &(plan->m_blockSums[level++]),
|
|
numBlocks * elementSize));
|
|
}
|
|
}
|
|
numElts = numBlocks;
|
|
} while (numElts > 1);
|
|
|
|
CUT_CHECK_ERROR("allocScanStorage");
|
|
}
|
|
|
|
/** @brief Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
|
|
*
|
|
* These arrays must have been allocated by allocScanStorage(), which is called
|
|
* by the constructor of cudppScanPlan().
|
|
*
|
|
* @param plan Pointer to CUDPPScanPlan object initialized by allocScanStorage().
|
|
*/
|
|
void freeScanStorage(CUDPPScanPlan *plan)
|
|
{
|
|
for (unsigned int i = 0; i < plan->m_numLevelsAllocated; i++)
|
|
{
|
|
cudaFree(plan->m_blockSums[i]);
|
|
}
|
|
|
|
CUT_CHECK_ERROR("freeScanStorage");
|
|
|
|
free((void**)plan->m_blockSums);
|
|
if (plan->m_numRows > 1)
|
|
free((void*)plan->m_rowPitches);
|
|
|
|
plan->m_blockSums = 0;
|
|
plan->m_numEltsAllocated = 0;
|
|
plan->m_numLevelsAllocated = 0;
|
|
}
|
|
|
|
|
|
/** @brief Dispatch function to perform a scan (prefix sum) on an
|
|
* array with the specified configuration.
|
|
*
|
|
* This is the dispatch routine which calls scanArrayRecursive() with
|
|
* appropriate template parameters and arguments to achieve the scan as
|
|
* specified in \a plan.
|
|
*
|
|
* @param[out] d_out The output array of scan results
|
|
* @param[in] d_in The input array
|
|
* @param[in] numElements The number of elements to scan
|
|
* @param[in] numRows The number of rows to scan in parallel
|
|
* @param[in] plan Pointer to CUDPPScanPlan object containing scan options
|
|
* and intermediate storage
|
|
*/
|
|
void cudppScanDispatch(void *d_out,
|
|
const void *d_in,
|
|
size_t numElements,
|
|
size_t numRows,
|
|
const CUDPPScanPlan *plan)
|
|
{
|
|
if (CUDPP_OPTION_EXCLUSIVE & plan->m_config.options)
|
|
{
|
|
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
|
|
{
|
|
switch (plan->m_config.datatype)
|
|
{
|
|
case CUDPP_INT:
|
|
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<int, true, true, CUDPP_ADD>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<int, true, true, CUDPP_MULTIPLY>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<int, true, true, CUDPP_MAX>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<int, true, true, CUDPP_MIN>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_UINT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<unsigned int, true, true, CUDPP_ADD>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<unsigned int, true, true, CUDPP_MULTIPLY>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<unsigned int, true, true, CUDPP_MAX>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<unsigned int, true, true, CUDPP_MIN>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_FLOAT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<float, true, true, CUDPP_ADD>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<float, true, true, CUDPP_MULTIPLY>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<float, true, true, CUDPP_MAX>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<float, true, true, CUDPP_MIN>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch (plan->m_config.datatype)
|
|
{
|
|
case CUDPP_INT:
|
|
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<int, false, true, CUDPP_ADD>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<int, false, true, CUDPP_MULTIPLY>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<int, false, true, CUDPP_MAX>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<int, false, true, CUDPP_MIN>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_UINT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<unsigned int, false, true, CUDPP_ADD>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<unsigned int, false, true, CUDPP_MULTIPLY>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<unsigned int, false, true, CUDPP_MAX>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<unsigned int, false, true, CUDPP_MIN>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_FLOAT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<float, false, true, CUDPP_ADD>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<float, false, true, CUDPP_MULTIPLY>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<float, false, true, CUDPP_MAX>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<float, false, true, CUDPP_MIN>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
|
|
{
|
|
switch (plan->m_config.datatype)
|
|
{
|
|
case CUDPP_INT:
|
|
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<int, true, false, CUDPP_ADD>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<int, true, false, CUDPP_MULTIPLY>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<int, true, false, CUDPP_MAX>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<int, true, false, CUDPP_MIN>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_UINT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<unsigned int, true, false, CUDPP_ADD>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<unsigned int, true, false, CUDPP_MULTIPLY>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<unsigned int, true, false, CUDPP_MAX>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<unsigned int, true, false, CUDPP_MIN>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_FLOAT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<float, true, false, CUDPP_ADD>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<float, true, false, CUDPP_MULTIPLY>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<float, true, false, CUDPP_MAX>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<float, true, false, CUDPP_MIN>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch (plan->m_config.datatype)
|
|
{
|
|
case CUDPP_INT:
|
|
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<int, false, false, CUDPP_ADD>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<int, false, false, CUDPP_MULTIPLY>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<int, false, false, CUDPP_MAX>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<int, false, false, CUDPP_MIN>
|
|
((int*)d_out, (const int*)d_in,
|
|
(int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_UINT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<unsigned int, false, false, CUDPP_ADD>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<unsigned int, false, false, CUDPP_MULTIPLY>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<unsigned int, false, false, CUDPP_MAX>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<unsigned int, false, false, CUDPP_MIN>
|
|
((unsigned int*)d_out, (const unsigned int*)d_in,
|
|
(unsigned int**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case CUDPP_FLOAT:
|
|
switch(plan->m_config.op)
|
|
{
|
|
case CUDPP_ADD:
|
|
scanArrayRecursive<float, false, false, CUDPP_ADD>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MULTIPLY:
|
|
scanArrayRecursive<float, false, false, CUDPP_MULTIPLY>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MAX:
|
|
scanArrayRecursive<float, false, false, CUDPP_MAX>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
case CUDPP_MIN:
|
|
scanArrayRecursive<float, false, false, CUDPP_MIN>
|
|
((float*)d_out, (const float*)d_in,
|
|
(float**)plan->m_blockSums,
|
|
numElements, numRows, plan->m_rowPitches, 0);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
/** @} */ // end scan functions
|
|
/** @} */ // end cudpp_app
|