lammps/lib/gpu/cudpp_mini/cudpp.cpp

418 lines
16 KiB
C++

// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.cpp
*
* @brief Main library source file. Implements wrappers for public
* interface.
*
* Main library source file. Implements wrappers for public
* interface. These wrappers call application-level operators.
* As this grows we may decide to partition into multiple source
* files.
*/
/**
* \defgroup publicInterface CUDPP Public Interface
* The CUDA public interface comprises the functions, structs, and enums
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Inteface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work
* of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply.
*
* @{
*/
/** @name Algorithm Interface
* @{
*/
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"
/**
* @brief Performs a scan operation of numElements on its input in
* GPU memory (d_in) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a scan operation is an input array, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The output of scan is the same size as its input.
* Informally, the output at each element is the result of operator
* applied to each input that comes before it. For instance, the
* output of sum-scan at each element is the sum of all the input
* elements before that input.
*
* More formally, for associative operator
* @htmlonly⊕@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
*
* CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
* an exclusive scan computes the sum of all input elements before the
* current element, while an inclusive scan computes the sum of all input
* elements up to and including the current element.
*
* Before calling scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
*
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements to scan
*
* @see cudppPlan, cudppDestroyPlan
*/
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, 1, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
/**
* @brief Performs a segmented scan operation of numElements on its input in
* GPU memory (d_idata) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a segmented scan operation is an input array of data,
* an input array of flags which demarcate segments, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The array of flags is the same length as the input
* with 1 marking the the first element of a segment and 0 otherwise. The
* output of segmented scan is the same size as its input. Informally, the
* output at each element is the result of operator applied to each input
* that comes before it in that segment. For instance, the output of
* segmented sum-scan at each element is the sum of all the input elements
* before that input in that segment.
*
* More formally, for associative operator
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>k+1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
* <i>k</i> is the index of the first element of the segment in which <i>i</i> lies
*
* We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
* the exclusive variant computes the sum of all input elements before the
* current element in that segment, while the inclusive variant computes the
* sum of all input elements up to and including the current element, in
* that segment.
*
* Before calling segmented scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of segmented scan, in GPU memory
* @param[in] d_idata input data to segmented scan, in GPU memory
* @param[in] d_iflags input flags to segmented scan, in GPU memory
* @param[in] numElements number of elements to perform segmented scan on
*
* @see cudppPlan, cudppDestroyPlan
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements)
{
CUDPPSegmentedScanPlan *plan =
(CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
* @brief Performs numRows parallel scan operations of numElements
* each on its input (d_in) and places the output in d_out,
* with the scan parameters set by config. Exactly like cudppScan
* except that it runs on multiple rows in parallel.
*
* Note that to achieve good performance with cudppMultiScan one should
* allocate the device arrays passed to it so that all rows are aligned
* to the correct boundaries for the architecture the app is running on.
* The easy way to do this is to use cudaMallocPitch() to allocate a
* 2D array on the device. Use the \a rowPitch parameter to cudppPlan()
* to specify this pitch. The easiest way is to pass the device pitch
* returned by cudaMallocPitch to cudppPlan() via \a rowPitch.
*
* @param[in] planHandle handle to CUDPPScanPlan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements (per row) to scan
* @param[in] numRows number of rows to scan in parallel
*
* @see cudppScan, cudppPlan
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
* @brief Given an array \a d_in and an array of 1/0 flags in \a
* deviceValid, returns a compacted array in \a d_out of corresponding
* only the "valid" values from \a d_in.
*
* Takes as input an array of elements in GPU memory
* (\a d_in) and an equal-sized unsigned int array in GPU memory
* (\a deviceValid) that indicate which of those input elements are
* valid. The output is a packed array, in GPU memory, of only those
* elements marked as valid.
*
* Internally, uses cudppScan.
*
* Example:
* \code
* d_in = [ a b c d e f ]
* deviceValid = [ 1 0 1 1 0 1 ]
* d_out = [ a c d f ]
* \endcode
*
* @todo [MJH] We need to evaluate whether cudppCompact should be a core member
* of the public interface. It's not clear to me that what the user always
* wants is a final compacted array. Often one just wants the array of indices
* to which each input element should go in the output. The split() routine used
* in radix sort might make more sense to expose.
*
* @param[in] planHandle handle to CUDPPCompactPlan
* @param[out] d_out compacted output
* @param[out] d_numValidElements set during cudppCompact; is set with the
* number of elements valid flags in the d_isValid input array
* @param[in] d_in input to compact
* @param[in] d_isValid which elements in d_in are valid
* @param[in] numElements number of elements in d_in
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements)
{
CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Sorts key-value pairs or keys only
*
* Takes as input an array of keys in GPU memory
* (d_keys) and an optional array of corresponding values,
* and outputs sorted arrays of keys and (optionally) values in place.
* Key-value and key-only sort is selected through the configuration of
* the plan, using the options CUDPP_OPTION_KEYS_ONLY and
* CUDPP_OPTION_KEY_VALUE_PAIRS.
*
* Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
* any 32-bit type (internally, values are treated only as a payload
* and cast to unsigned int).
*
* @todo Determine if we need to provide an "out of place" sort interface.
*
* @param[in] planHandle handle to CUDPPSortPlan
* @param[out] d_keys keys by which key-value pairs will be sorted
* @param[in] d_values values to be sorted
* @param[in] keyBits the number of least significant bits in each element
* of d_keys to sort by
* @param[in] numElements number of elements in d_keys and d_values
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
*/
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keyBits,
size_t numElements)
{
CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x
*
* Given a matrix object handle (which has been initialized using cudppSparseMatrix()),
* This function multiplies the input vector \a d_x by the matrix referred to by
* \a sparseMatrixHandle, returning the result in \a d_y.
*
* @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
* @param d_y The output vector, y
* @param d_x The input vector, x
*
* @see cudppSparseMatrix, cudppDestroySparseMatrix
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x)
{
CUDPPSparseMatrixVectorMultiplyPlan *plan =
(CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);
if (plan != NULL)
{
cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Rand puts \a numElements random 32-bit elements into \a d_out
*
* Outputs \a numElements random values to \a d_out. \a d_out must be of
* type unsigned int, allocated in device memory.
*
* The algorithm used for the random number generation is stored in \a planHandle.
* Depending on the specification of the pseudo random number generator(PRNG),
* the generator may have one or more seeds. To set the seed, use cudppRandSeed().
*
* @todo Currently only MD5 PRNG is supported. We may provide more rand routines in
* the future.
*
* @param[in] planHandle Handle to plan for rand
* @param[in] numElements number of elements in d_out.
* @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers.
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
if(plan != NULL)
{
//dispatch the rand algorithm here
cudppRandDispatch(d_out, numElements, plan);
return CUDPP_SUCCESS;
}
else
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
*/
/**@brief Sets the seed used for rand
*
* The seed is crucial to any random number generator as it allows a
* sequence of random numbers to be replicated. Since there may be
* multiple different rand algorithms in CUDPP, cudppRandSeed
* uses \a planHandle to determine which seed to set. Each rand
* algorithm has its own unique set of seeds depending on what
* the algorithm needs.
*
* @param[in] planHandle the handle to the plan which specifies which rand seed to set
* @param[in] seed the value which the internal cudpp seed will be set to
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
//switch on the plan to figure out which seed to update
switch(plan->m_config.algorithm)
{
case CUDPP_RAND_MD5:
plan->m_seed = seed;
break;
default:
break;
}
return CUDPP_SUCCESS;
}//end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End: