// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * vector_kernel.cu
 *
 * @brief CUDA kernel methods for basic operations on vectors.
 *
 * CUDA kernel methods for basic operations on vectors.
 *
 * Examples:
 * - vectorAddConstant(): d_vector + constant
 * - vectorAddUniform():  d_vector + uniform (per-block constants)
 * - vectorAddVector():   d_vector + d_vector
 */

// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea? There may be cases where
// we have as many threads as elements, but for large problems
// we are probably limited by max CTA size for simple kernels like
// this, so we should process multiple elements per thread.
// We may want to extend these with looping versions that process
// many elements per thread.

#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"

/** \addtogroup cudpp_kernel
 * @{
 */

/** @name Vector Functions
 * CUDA kernel methods for basic operations on vectors.
 * @{
 */

/** @brief Adds a constant value to all values in the input d_vector
 *
 * Each thread adds the constant to two elements of \a d_vector.
 * @todo Test this function -- it is currently not yet used.
 *
 * @param[in,out] d_vector The array of elements to be modified
 * @param[in] constant The constant value to be added to elements of
 * \a d_vector
 * @param[in] n The number of elements in the d_vector to be modified
 * @param[in] baseIndex An optional offset to the beginning of the
 * elements in the input array to be processed
 */
template <class T>
__global__ void vectorAddConstant(T   *d_vector,
                                  T    constant,
                                  int  n,
                                  int  baseIndex)
{
    // Compute this thread's output address
    unsigned int address = baseIndex + threadIdx.x +
        __mul24(blockIdx.x, (blockDim.x << 1));

    // note two adds per thread: one in first half of the block, one in last
    d_vector[address]              += constant;
    d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant;
}

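// Illustrative sketch (not part of CUDPP): one way a host might launch
// vectorAddConstant given the two-elements-per-thread layout above. The
// names numThreads, numElems, and d_data are hypothetical, and numElems is
// assumed to be a multiple of 2*numThreads.
//
//     const int numThreads = 128;
//     const int numElems   = 1 << 20;
//     dim3 grid(numElems / (2 * numThreads));   // each block covers 2*numThreads elements
//     vectorAddConstant<float><<<grid, numThreads>>>(d_data, 1.0f, numElems, 0);
//     cudaDeviceSynchronize();
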
/** @brief Add a uniform value to each data element of an array
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to all values "owned" by the CTA in \a
 * d_vector. Each thread adds the uniform value to two elements of \a
 * d_vector.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] blockOffset An optional offset to the beginning of this block's
 * data.
 * @param[in] baseIndex An optional offset to the beginning of the array
 * within \a d_vector.
 */
template <class T>
__global__ void vectorAddUniform(T       *d_vector,
                                 const T *d_uniforms,
                                 int      numElements,
                                 int      blockOffset,
                                 int      baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1));

    __syncthreads();

    // note two adds per thread: one in first half of the block, one in last
    d_vector[address] += uni;
    if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}

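// Illustrative sketch (not part of CUDPP): vectorAddUniform is typically
// paired with an earlier pass that leaves one partial result per block in
// d_uniforms (e.g. per-block sums). The names d_data, d_blockSums, and
// numElems below are hypothetical; a 1D grid and a full final block are
// assumed.
//
//     const int numThreads = 128;                       // 2*128 elements per block
//     const int numBlocks  = numElems / (2 * numThreads);
//     vectorAddUniform<float><<<numBlocks, numThreads>>>(
//         d_data, d_blockSums, numElems, 0 /*blockOffset*/, 0 /*baseIndex*/);
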
/** @brief Add a uniform value to each data element of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and combines that value with all values "owned" by the CTA in
 * \a d_vector, using the operator \a op (add, multiply, max, or min).
 * Each thread processes \a elementsPerThread values in \a d_vector.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the
 * rows of \a d_vector.
 * @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the
 * rows of \a d_uniforms.
 * @param[in] blockOffset An optional offset to the beginning of this block's
 * data.
 * @param[in] baseIndex An optional offset to the beginning of the array
 * within \a d_vector.
 */
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T       *d_vector,
                                  const T *d_uniforms,
                                  int      numElements,
                                  int      vectorRowPitch,   // width of input array in elements
                                  int      uniformRowPitch,  // width of uniform array in elements
                                  int      blockOffset,
                                  int      baseIndex)
{
    __shared__ T uni;
    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle
    if (threadIdx.x == 0)
    {
        uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
    }

    // Compute this thread's output address
    //int width = __mul24(gridDim.x,(blockDim.x << 1));

    unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
        + threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread));
    numElements += __umul24(vectorRowPitch, blockIdx.y);

    __syncthreads();

    switch (op)
    {
    case CUDPP_ADD:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] += uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MULTIPLY:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] *= uni;
            address += blockDim.x;
        }
        break;

    case CUDPP_MAX:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = max(d_vector[address], uni);
            address += blockDim.x;
        }
        break;

    case CUDPP_MIN:
        for (int i = 0; i < elementsPerThread && address < numElements; i++)
        {
            d_vector[address] = min(d_vector[address], uni);
            address += blockDim.x;
        }
        break;
    default:
        break;
    }
}

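// Illustrative sketch (not part of CUDPP): op and elementsPerThread are
// template parameters, so the compiler can resolve the switch above at
// compile time and keep only the selected case in the generated kernel.
// A hypothetical 1D launch using float addition (d_data, d_blockSums, and
// numElems are assumed names):
//
//     const int elemsPerThread = 8;
//     const int numThreads     = 128;
//     dim3 grid((numElems + numThreads * elemsPerThread - 1) /
//               (numThreads * elemsPerThread), 1);
//     vectorAddUniform4<float, CUDPP_ADD, elemsPerThread><<<grid, numThreads>>>(
//         d_data, d_blockSums, numElems, numElems /*vectorRowPitch*/,
//         grid.x /*uniformRowPitch*/, 0 /*blockOffset*/, 0 /*baseIndex*/);
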
/** @brief Adds together two vectors
 *
 * Each thread adds two pairs of elements.
 * @todo Test this function -- it is currently not yet used.
 *
 * @param[out] d_vectorA The left operand array and the result
 * @param[in] d_vectorB The right operand array
 * @param[in] numElements The number of elements in the vectors to be added.
 * @param[in] baseIndex An optional offset to the beginning of the
 * elements in the input arrays to be processed
 */
template <class T>
__global__ void vectorAddVector(T       *d_vectorA, // A += B
                                const T *d_vectorB,
                                int      numElements,
                                int      baseIndex)
{
    // Compute this thread's output address
    unsigned int address = baseIndex + threadIdx.x +
        __mul24(blockIdx.x, (blockDim.x << 1));

    // note two adds per thread: one in first half of the block, one in last
    d_vectorA[address]              += d_vectorB[address];
    d_vectorA[address + blockDim.x] +=
        (threadIdx.x + blockDim.x < numElements) * d_vectorB[address + blockDim.x];
}

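// Illustrative sketch (not part of CUDPP): element-wise accumulation of d_b
// into d_a. baseIndex lets the same kernel operate on a sub-range; d_a, d_b,
// offset, and count are hypothetical names, with count assumed to be a
// multiple of 2*numThreads.
//
//     const int numThreads = 128;
//     dim3 grid(count / (2 * numThreads));
//     vectorAddVector<float><<<grid, numThreads>>>(d_a, d_b, count, offset);
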
/** @brief Add a uniform value to data elements of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to values "owned" by the CTA in \a d_vector.
 * The uniform value is added only to those values "owned" by the CTA whose
 * index is less than the CTA's entry in \a d_maxIndices. If that entry is
 * UINT_MAX, the uniform is added to all values "owned" by the CTA.
 * Each thread adds the uniform value to eight values in \a d_vector.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_maxIndices The array of maximum indices (one per CTA). This is
 * the index up to which the uniform is added. If this is UINT_MAX,
 * the uniform is added to all elements of the CTA. This index is
 * 1-based.
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] blockOffset An optional offset to the beginning of this block's
 * data.
 * @param[in] baseIndex An optional offset to the beginning of the array
 * within \a d_vector.
 */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T                  *d_vector,
                                           const T            *d_uniforms,
                                           const unsigned int *d_maxIndices,
                                           unsigned int        numElements,
                                           int                 blockOffset,
                                           int                 baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress =
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle

    if (threadIdx.x == 0)
    {
        if (blockAddress > 0)
            uni[0] = d_uniforms[blockAddress-1];
        else
            uni[0] = Operator<T, oper>::identity();

        // Tacit assumption that T is four-byte wide
        uni[1] = (T)(d_maxIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));

    __syncthreads();

    unsigned int maxIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));

    if (maxIndex < UINT_MAX)
    {
        // Since maxIndex is a 1-based index
        --maxIndex;
        bool leftLess  = address < maxIndex;
        bool rightLess = (address + 7 * blockDim.x) < maxIndex;

        if (leftLess)
        {
            if (rightLess)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] =
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i = 0; i < 8; ++i)
                {
                    if (address < maxIndex)
                        d_vector[address] =
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] =
                        Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                d_vector[address] =
                    Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
    }
}

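// Illustrative sketch (not part of CUDPP): in a segmented pass, d_uniforms
// would hold one value per block (e.g. partial results from a previous step)
// and d_maxIndices the 1-based index up to which each block should apply its
// uniform (UINT_MAX to apply it to the whole block), as described above.
// A hypothetical launch with float addition and a full last block:
//
//     const int numThreads = 128;   // each block covers 8*numThreads elements
//     dim3 grid(numBlocks, 1);
//     vectorSegmentedAddUniform4<float, CUDPP_ADD, true><<<grid, numThreads>>>(
//         d_data, d_blockSums, d_blockMaxIndices, numElems,
//         0 /*blockOffset*/, 0 /*baseIndex*/);
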
/** @brief Add a uniform value to data elements of an array (vec4 version)
 *
 * This function reads one value per CTA from \a d_uniforms into shared
 * memory and adds that value to values "owned" by the CTA in \a d_vector.
 * The uniform value is added only to those values "owned" by the CTA whose
 * index is greater than the CTA's entry in \a d_minIndices. If that entry
 * is 0, the uniform is added to all values "owned" by the CTA.
 * Each thread adds the uniform value to eight values in \a d_vector.
 *
 * @param[out] d_vector The d_vector whose values will have the uniform added
 * @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_minIndices The array of minimum indices (one per CTA). The
 * uniform is added to the right of this index (that is, to every index
 * that is greater than this index). If this is 0, the uniform is
 * added to all elements of the CTA. The index is 1-based so that 0 is
 * not overloaded: 0 means there is no flag in this CTA, while a flag
 * on the first element of a CTA (which would otherwise also be
 * reported as 0) is reported as 1.
 * @param[in] numElements The number of elements in \a d_vector to process
 * @param[in] blockOffset An optional offset to the beginning of this block's
 * data.
 * @param[in] baseIndex An optional offset to the beginning of the array
 * within \a d_vector.
 */
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T                  *d_vector,
                                                  const T            *d_uniforms,
                                                  const unsigned int *d_minIndices,
                                                  unsigned int        numElements,
                                                  int                 blockOffset,
                                                  int                 baseIndex)
{
    __shared__ T uni[2];

    unsigned int blockAddress =
        blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;

    // Get this block's uniform value from the uniform array in device memory
    // We store it in shared memory so that the hardware's shared memory
    // broadcast capability can be used to share among all threads in each warp
    // in a single cycle

    if (threadIdx.x == 0)
    {
        // FIXME - blockAddress test here is incompatible with how it is calculated
        // above
        if (blockAddress < (gridDim.x-1))
            uni[0] = d_uniforms[blockAddress+1];
        else
            uni[0] = Operator<T, oper>::identity();

        // Tacit assumption that T is four-byte wide
        uni[1] = (T)(d_minIndices[blockAddress]);
    }

    // Compute this thread's output address
    int width = __mul24(gridDim.x, (blockDim.x << 1));

    unsigned int address = baseIndex + __mul24(width, blockIdx.y)
        + threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));

    __syncthreads();

    unsigned int minIndex = (unsigned int)(uni[1]);

    bool isLastBlock = (blockIdx.x == (gridDim.x-1));

    if (minIndex > 0)
    {
        // Since minIndex is a 1-based index
        --minIndex;
        bool leftInRange  = address > minIndex;
        bool rightInRange = (address + 7 * blockDim.x) > minIndex;

        if (rightInRange)
        {
            if (leftInRange)
            {
                for (unsigned int i = 0; i < 8; ++i)
                    d_vector[address + i * blockDim.x] =
                        Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
            }
            else
            {
                for (unsigned int i = 0; i < 8; ++i)
                {
                    if (address > minIndex)
                        d_vector[address] =
                            Operator<T, oper>::op(d_vector[address], uni[0]);

                    address += blockDim.x;
                }
            }
        }
    }
    else
    {
        if (!isLastBlockFull && isLastBlock)
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                if (address < numElements)
                    d_vector[address] =
                        Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
        else
        {
            for (unsigned int i = 0; i < 8; ++i)
            {
                d_vector[address] =
                    Operator<T, oper>::op(d_vector[address], uni[0]);

                address += blockDim.x;
            }
        }
    }
}

/** @} */ // end vector functions
/** @} */ // end cudpp_kernel