lammps/lib/gpu/cudpp_mini/cudpp_plan.h

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2010-11-23 13:04:43 -0700 (Tue, 23 Nov 2010) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__

typedef void* KernelPointer;

extern "C" size_t getNumCTAs(KernelPointer kernel);
extern "C" void   compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock);

template <typename T>
size_t numCTAs(T kernel)
{
    return getNumCTAs((KernelPointer)kernel);
}

template <typename T>
void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock)
{
    compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock);
}

/** @brief Base class for CUDPP Plan data structures
  *
  * CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
  * library user) infrastructure for planning algorithm execution.  They
  * own intermediate storage for CUDPP algorithms as well as, in some cases,
  * information about optimal execution configuration for the present hardware.
  *
  */
class CUDPPPlan
{
public:
    CUDPPPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
    virtual ~CUDPPPlan() {}

    // Note anything passed to functions compiled by NVCC must be public
    CUDPPConfiguration m_config;        //!< @internal Options structure
    size_t             m_numElements;   //!< @internal Maximum number of input elements
    size_t             m_numRows;       //!< @internal Maximum number of input rows
    size_t             m_rowPitch;      //!< @internal Pitch of input rows in elements
};

/** @brief Plan class for scan algorithm
  *
  */
class CUDPPScanPlan : public CUDPPPlan
{
public:
    CUDPPScanPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
    virtual ~CUDPPScanPlan();

    void  **m_blockSums;          //!< @internal Intermediate block sums array
    size_t *m_rowPitches;         //!< @internal Pitch of each row in elements (for cudppMultiScan())
    size_t  m_numEltsAllocated;   //!< @internal Number of elements allocated (maximum scan size)
    size_t  m_numRowsAllocated;   //!< @internal Number of rows allocated (for cudppMultiScan())
    size_t  m_numLevelsAllocated; //!< @internal Number of levels allocaed (in _scanBlockSums)
};

/** @brief Plan class for segmented scan algorithm
*
*/
class CUDPPSegmentedScanPlan : public CUDPPPlan
{
public:
    CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements);
    virtual ~CUDPPSegmentedScanPlan();

    void          **m_blockSums;          //!< @internal Intermediate block sums array
    unsigned int  **m_blockFlags;         //!< @internal Intermediate block flags array
    unsigned int  **m_blockIndices;       //!< @internal Intermediate block indices array
    size_t        m_numEltsAllocated;     //!< @internal Number of elements allocated (maximum scan size)
    size_t        m_numLevelsAllocated;   //!< @internal Number of levels allocaed (in _scanBlockSums)
};

/** @brief Plan class for compact algorithm
*
*/
class CUDPPCompactPlan : public CUDPPPlan
{
public:
    CUDPPCompactPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
    virtual ~CUDPPCompactPlan();

    CUDPPScanPlan *m_scanPlan;         //!< @internal Compact performs a scan of type unsigned int using this plan
    unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan

};

class CUDPPRadixSortPlan : public CUDPPPlan
{
public:
    CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
    virtual ~CUDPPRadixSortPlan();

    bool           m_bKeysOnly;
    bool           m_bManualCoalesce;
    bool           m_bUsePersistentCTAs;
    unsigned int   m_persistentCTAThreshold[2];
    unsigned int   m_persistentCTAThresholdFullBlocks[2];
    CUDPPScanPlan *m_scanPlan;        //!< @internal Sort performs a scan of type unsigned int using this plan
    unsigned int   m_keyBits;
    mutable void  *m_tempKeys;        //!< @internal Intermediate storage for keys
    mutable void  *m_tempValues;      //!< @internal Intermediate storage for values
    unsigned int  *m_counters;        //!< @internal Counter for each radix
    unsigned int  *m_countersSum;     //!< @internal Prefix sum of radix counters
    unsigned int  *m_blockOffsets;    //!< @internal Global offsets of each radix in each block

};

/** @brief Plan class for sparse-matrix dense-vector multiply
*
*/
class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
{
public:
    CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
                                        const void         *A,
                                        const unsigned int *rowindx,
                                        const unsigned int *indx, size_t numRows);
    virtual ~CUDPPSparseMatrixVectorMultiplyPlan();

    CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
    void             *m_d_prod;  //!< @internal Vector of products (of an element in A and its corresponding (thats is
                                 //!            belongs to the same row) element in x; this is the input and output of
                                 //!            segmented scan
    unsigned int     *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
                                 //!            of its row; this is the flags vector for segmented scan
    unsigned int     *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
                                         //!            which is the last element of that row. Resides in GPU memory.
    unsigned int     *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
                                    //!            which is the first element of that row. Resides in GPU memory.
    unsigned int     *m_d_index;    //!<@internal Vector of column numbers one for each element in A
    void             *m_d_A;        //!<@internal The A matrix
    unsigned int     *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
                                       //!            which is the last element of that row. Resides in CPU memory.
    size_t           m_numRows; //!< Number of rows
    size_t           m_numNonZeroElements; //!<Number of non-zero elements
};

/** @brief Plan class for random number generator
*
*/
class CUDPPRandPlan : public CUDPPPlan
{
public:
    CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements);

    unsigned int m_seed; //!< @internal the seed for the random number generator
};
#endif // __CUDPP_PLAN_H__