lammps/lib/gpu/cudpp_mini/radixsort_app.cu

994 lines
38 KiB
Plaintext

// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* radixsort_app.cu
*
* @brief CUDPP application-level radix sorting routines
*/
/** @addtogroup cudpp_app
* @{
*/
/** @name RadixSort Functions
* @{
*/
#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_radixsort.h"
#include "cudpp_scan.h"
#include "kernel/radixsort_kernel.cu"
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>
typedef unsigned int uint;
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* Uses cudppScanDispatch() for the prefix sum of radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStep(uint *keys,
uint *values,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
// Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
// on GT200, resulting in better scheduling and lower run times
if (startbit > 0)
{
emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
}
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
numCTAs(radixSortBlocks<4, 0, true, false, true>);
}
radixSortBlocks<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
numCTAs(radixSortBlocks<4, 0, false, false, true>);
}
radixSortBlocks<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
CUT_CHECK_ERROR("radixSortBlocks");
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("findRadixOffsets");
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
numCTAs(reorderData<0, true, true, false, true>);
}
reorderData<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
numCTAs(reorderData<0, true, false, false, true>);
}
reorderData<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, true, true, true>) :
numCTAs(reorderData<0, false, true, false, true>);
}
reorderData<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, false, true, true>) :
numCTAs(reorderData<0, false, false, false, true>);
}
reorderData<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
CUT_CHECK_ERROR("radixSortStep");
}
/**
* @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlock(uint *keys,
uint *values,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocks<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
else
{
radixSortBlocks<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function
*
* Main radix sort function. Sorts in place in the keys and values arrays,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses cudppScan() for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in] flipBits Is set true if key datatype is a float
* (neg. numbers) for special float sorting operations.
* @param[in] keyBits Number of interesting bits in the key
**/
void radixSort(uint *keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool flipBits,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarp<true><<<1, numElements>>>
(keys, values, numElements);
else
radixSortSingleWarp<false><<<1, numElements>>>
(keys, values, numElements);
CUT_CHECK_ERROR("radixSortSingleWarp");
return;
}
#ifdef __DEVICE_EMULATION__
printf("bits: %d\n", keyBits);
#endif
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlock<true>(keys, values, numElements);
else
radixSortSingleBlock<false>(keys, values, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStep<4, 0, true, false>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 0, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 4)
{
radixSortStep<4, 4, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 8)
{
radixSortStep<4, 8, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 12)
{
radixSortStep<4, 12, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 16)
{
radixSortStep<4, 16, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 20)
{
radixSortStep<4, 20, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 24)
{
radixSortStep<4, 24, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStep<4, 28, false, true>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 28, false, false>
(keys, values, plan, numElements);
}
}
}
/**
* @brief Wrapper to call main radix sort function. For float configuration.
*
* Calls the main radix sort function. For float configuration.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in] negativeKeys Is set true if key datatype has neg. numbers.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeys(float* keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool negativeKeys,
int keyBits)
{
radixSort((uint*)keys, (uint*)values, plan,
numElements, negativeKeys, keyBits);
}
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStepKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
}
reorderDataKeysOnly<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
}
reorderDataKeysOnly<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
}
reorderDataKeysOnly<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
}
reorderDataKeysOnly<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("radixSortStepKeysOnly");
}
/**
* @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only).
*
* @param[in,out] keys Keys to be sorted.
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlockKeysOnly(uint *keys,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocksKeysOnly<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
else
{
radixSortBlocksKeysOnly<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
if (flip)
unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function. For keys only configuration.
*
* Main radix sort function. Sorts in place in the keys array,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses scan for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in] flipBits Is set true if key datatype is a float (neg. numbers)
* for special float sorting operations.
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
bool flipBits,
size_t numElements,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
else
radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
return;
}
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlockKeysOnly<true>(keys, numElements);
else
radixSortSingleBlockKeysOnly<false>(keys, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements);
}
if (keyBits > 4)
{
radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements);
}
if (keyBits > 8)
{
radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements);
}
if (keyBits > 12)
{
radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
}
if (keyBits > 16)
{
radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
}
if (keyBits > 20)
{
radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
}
if (keyBits > 24)
{
radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
}
}
}
/**
* @brief Wrapper to call main radix sort function. For floats and keys only.
*
* Calls the radixSortKeysOnly function setting parameters for floats.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in] negativeKeys Is set true if key flipBits is to be true in
* radixSortKeysOnly().
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeysOnly(float *keys,
const CUDPPRadixSortPlan *plan,
bool negativeKeys,
size_t numElements,
int keyBits)
{
radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
}
extern "C"
void initDeviceParameters(CUDPPRadixSortPlan *plan)
{
int deviceID = -1;
if (cudaSuccess == cudaGetDevice(&deviceID))
{
cudaDeviceProp devprop;
cudaGetDeviceProperties(&devprop, deviceID);
int smVersion = devprop.major * 10 + devprop.minor;
// sm_12 and later devices don't need help with coalesce in reorderData kernel
plan->m_bManualCoalesce = (smVersion < 12);
// sm_20 and later devices are better off not using persistent CTAs
plan->m_bUsePersistentCTAs = (smVersion < 20);
if (plan->m_bUsePersistentCTAs)
{
// The following is only true on pre-sm_20 devices (pre-Fermi):
// Empirically we have found that for some (usually larger) sort
// sizes it is better to use exactly as many "persistent" CTAs
// as can fill the GPU, which loop over the "blocks" of work. For smaller
// arrays it is better to use the typical CUDA approach of launching one CTA
// per block of work.
// 0-element of these two-element arrays is for key-value sorts
// 1-element is for key-only sorts
plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288;
plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0;
// create a map of function pointers to register counts for more accurate occupancy calculation
// Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it
// Note we only insert the "loop" version of the kernels (the one with the last template param = true)
// Because those are the only ones that require persistent CTAs that maximally fill the device.
computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE);
}
}
}
/**
* @brief From the programmer-specified sort configuration,
* creates internal memory for performing the sort.
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
{
unsigned int numElements = plan->m_numElements;
unsigned int numBlocks =
((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
(numElements / (SORT_CTA_SIZE * 4)) :
(numElements / (SORT_CTA_SIZE * 4) + 1);
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(unsigned int)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
break;
case CUDPP_FLOAT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(float)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(float)));
break;
}
initDeviceParameters(plan);
}
/** @brief Deallocates intermediate memory from allocRadixSortStorage.
*
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
{
CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
CUDA_SAFE_CALL( cudaFree(plan->m_counters));
CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
}
/** @brief Dispatch function to perform a sort on an array with
* a specified configuration.
*
* This is the dispatch routine which calls radixSort...() with
* appropriate template parameters and arguments as specified by
* the plan.
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key*
* @param[in] plan Configuration information for RadixSort.
**/
extern "C"
void cudppRadixSortDispatch(void *keys,
void *values,
size_t numElements,
int keyBits,
const CUDPPRadixSortPlan *plan)
{
if(plan->m_bKeysOnly)
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSortKeysOnly((uint*)keys, plan, false,
numElements, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeysOnly((float*)keys, plan, true,
numElements, keyBits);
}
}
else
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSort((uint*)keys, (uint*) values, plan,
numElements, false, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeys((float*)keys, (uint*) values, plan,
numElements, true, keyBits);
}
}
}
/** @} */ // end radixsort functions
/** @} */ // end cudpp_app