From 4366bfffd35e92fe834b9ad75ab9179922886b7e Mon Sep 17 00:00:00 2001 From: pscrozi Date: Tue, 23 Nov 2010 19:52:03 +0000 Subject: [PATCH] Getting rid of extra CR characters at ends of lines. git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5285 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/atomic_gpu_memory.cpp | 4 +- lib/gpu/atomic_gpu_memory.h | 2 +- lib/gpu/charge_gpu_memory.cpp | 2 +- lib/gpu/charge_gpu_memory.h | 2 +- lib/gpu/cmm_cut_gpu.cpp | 2 +- lib/gpu/cmm_cut_gpu_memory.cpp | 2 +- lib/gpu/cmm_cut_gpu_memory.h | 2 +- lib/gpu/cmmc_long_gpu.cpp | 2 +- lib/gpu/cmmc_long_gpu_memory.cpp | 2 +- lib/gpu/cmmc_long_gpu_memory.h | 2 +- lib/gpu/cudpp_mini/cta/radixsort_cta.cu | 674 +++--- lib/gpu/cudpp_mini/cta/scan_cta.cu | 1238 +++++----- lib/gpu/cudpp_mini/cudpp.cpp | 2 +- lib/gpu/cudpp_mini/cudpp.h | 2 +- lib/gpu/cudpp_mini/cudpp_globals.h | 2 +- lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp | 2 +- lib/gpu/cudpp_mini/cudpp_maximal_launch.h | 2 +- lib/gpu/cudpp_mini/cudpp_plan.cpp | 2 +- lib/gpu/cudpp_mini/cudpp_plan.h | 2 +- lib/gpu/cudpp_mini/cudpp_plan_manager.cpp | 2 +- lib/gpu/cudpp_mini/cudpp_plan_manager.h | 2 +- lib/gpu/cudpp_mini/cudpp_radixsort.h | 2 +- lib/gpu/cudpp_mini/cudpp_scan.h | 2 +- lib/gpu/cudpp_mini/cudpp_util.h | 2 +- lib/gpu/cudpp_mini/cutil.h | 2 +- lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu | 1736 +++++++------- lib/gpu/cudpp_mini/license.txt | 1 + lib/gpu/cudpp_mini/radixsort_app.cu | 1986 ++++++++--------- lib/gpu/cudpp_mini/sharedmem.h | 2 +- lib/gpu/gb_gpu.cpp | 2 +- lib/gpu/gb_gpu_memory.cpp | 2 +- lib/gpu/geryon/VERSION.txt | 1 + lib/gpu/geryon/file_to_cstr.sh | 2 +- lib/gpu/geryon/nvc_device.h | 2 +- lib/gpu/geryon/nvc_macros.h | 2 +- lib/gpu/geryon/nvc_texture.h | 2 +- lib/gpu/geryon/nvd_device.h | 2 +- lib/gpu/geryon/nvd_kernel.h | 2 +- lib/gpu/geryon/nvd_macros.h | 2 +- lib/gpu/geryon/nvd_mat.h | 2 +- lib/gpu/geryon/nvd_memory.h | 2 +- lib/gpu/geryon/nvd_texture.h | 2 +- lib/gpu/geryon/nvd_timer.h | 2 +- lib/gpu/geryon/ocl_device.h | 2 +- lib/gpu/geryon/ocl_kernel.h | 2 +- lib/gpu/geryon/ocl_mat.h | 2 +- lib/gpu/geryon/ocl_texture.h | 2 +- lib/gpu/geryon/ocl_timer.h | 2 +- lib/gpu/geryon/ucl_arg_kludge.h | 2 +- lib/gpu/geryon/ucl_basemat.h | 2 +- lib/gpu/geryon/ucl_copy.h | 2 +- lib/gpu/geryon/ucl_d_mat.h | 2 +- lib/gpu/geryon/ucl_d_vec.h | 2 +- lib/gpu/geryon/ucl_get_devices.cpp | 2 +- lib/gpu/geryon/ucl_h_mat.h | 2 +- lib/gpu/geryon/ucl_h_vec.h | 2 +- lib/gpu/geryon/ucl_nv_kernel.h | 2 +- lib/gpu/geryon/ucl_print.h | 2 +- lib/gpu/geryon/ucl_types.h | 2 +- lib/gpu/lj96_cut_gpu.cpp | 2 +- lib/gpu/lj96_cut_gpu_memory.cpp | 2 +- lib/gpu/lj96_cut_gpu_memory.h | 2 +- lib/gpu/lj_cut_gpu.cpp | 2 +- lib/gpu/lj_cut_gpu_memory.cpp | 2 +- lib/gpu/lj_cut_gpu_memory.h | 2 +- lib/gpu/ljc_cut_gpu.cpp | 2 +- lib/gpu/ljc_cut_gpu_memory.cpp | 2 +- lib/gpu/ljc_cut_gpu_memory.h | 2 +- lib/gpu/ljcl_cut_gpu.cpp | 2 +- lib/gpu/ljcl_cut_gpu_memory.cpp | 2 +- lib/gpu/ljcl_cut_gpu_memory.h | 2 +- lib/gpu/pair_gpu_atom.cpp | 2 +- lib/gpu/pair_gpu_balance.h | 2 +- lib/gpu/pair_gpu_device.cpp | 2 +- lib/gpu/pair_gpu_device.h | 2 +- lib/gpu/pair_gpu_nbor.cpp | 2 +- lib/gpu/pair_gpu_precision.h | 2 +- lib/gpu/pair_win_sort.cpp | 2 +- 78 files changed, 2892 insertions(+), 2890 deletions(-) diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index d23ac78523..e1cc48048b 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -10,11 +10,11 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ - + #include "atomic_gpu_memory.h" #define AtomicGPUMemoryT AtomicGPUMemory diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index 91003f5c0d..81de41f3b7 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index a14f7b7205..ce43fdfda1 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index c53f897118..d18857e4d6 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp index 11e0e912eb..b09d713f91 100644 --- a/lib/gpu/cmm_cut_gpu.cpp +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp index bbef5e2119..e5a83e5872 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h index b74b809e29..8099d5b9c4 100644 --- a/lib/gpu/cmm_cut_gpu_memory.h +++ b/lib/gpu/cmm_cut_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp index a647a08e4d..11c527aac2 100644 --- a/lib/gpu/cmmc_long_gpu.cpp +++ b/lib/gpu/cmmc_long_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 3625ef1caf..9a63bc5628 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h index a9e6e56934..8192c78249 100644 --- a/lib/gpu/cmmc_long_gpu_memory.h +++ b/lib/gpu/cmmc_long_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/cudpp_mini/cta/radixsort_cta.cu b/lib/gpu/cudpp_mini/cta/radixsort_cta.cu index e7551b2407..de7bd1c1d3 100644 --- a/lib/gpu/cudpp_mini/cta/radixsort_cta.cu +++ b/lib/gpu/cudpp_mini/cta/radixsort_cta.cu @@ -1,337 +1,337 @@ -// ------------------------------------------------------------- -// CUDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision$ -// $Date$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt -// in the root directory of this source distribution. -// ------------------------------------------------------------- -#include -#include "cudpp_radixsort.h" -#include "cta/scan_cta.cu" -#include -#include - -#include -#include -#include "sharedmem.h" - - -#ifdef __DEVICE_EMULATION__ -#define __EMUSYNC __syncthreads() -#else -#define __EMUSYNC -#endif - -/** - * @file - * sort_cta.cu - * - * @brief CUDPP CTA-level sort routines - */ - -/** \addtogroup cudpp_cta -* @{ -*/ - -/** @name Radix Sort Functions -* @{ -*/ - - -typedef unsigned int uint; - -/** - * @brief Flips bits of single-precision floating-point number (parameterized by doFlip) - * - * flip a float for sorting - * finds SIGN of fp number. 
- * if it's 1 (negative float), it flips all bits - * if it's 0 (positive float), it flips the sign only - * @param[in] f floating-point input (passed as unsigned int) - * @see floatUnflip -**/ - -template -__device__ uint floatFlip(uint f) -{ - if (doFlip) - { - uint mask = -int(f >> 31) | 0x80000000; - return f ^ mask; - } - else - return f; -} - -/** - * @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip) - * - * flip a float back (invert FloatFlip) - * signed was flipped from above, so: - * if sign is 1 (negative), it flips the sign bit back - * if sign is 0 (positive), it flips all bits back - * @param[in] f floating-point input (passed as unsigned int) - * @see floatFlip -**/ -template -__device__ uint floatUnflip(uint f) -{ - if (doFlip) - { - uint mask = ((f >> 31) - 1) | 0x80000000; - return f ^ mask; - } - else - return f; -} - -/** - * @brief Scans one warp quickly, optimized for 32-element warps, using shared memory - * - * Scans each warp in parallel ("warp-scan"), one element per thread. - * uses 2 numElements of shared memory per thread (64 numElements per warp) - * - * @param[in] val Elements per thread to scan - * @param[in,out] sData -**/ -template -__device__ T scanwarp(T val, volatile T* sData) -{ - // The following is the same as 2 * WARP_SIZE * warpId + threadInWarp = - // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1)) - int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1)); - sData[idx] = 0; - idx += WARP_SIZE; - T t = sData[idx] = val; __EMUSYNC; - -#ifdef __DEVICE_EMULATION__ - t = sData[idx - 1]; __EMUSYNC; - sData[idx] += t; __EMUSYNC; - t = sData[idx - 2]; __EMUSYNC; - sData[idx] += t; __EMUSYNC; - t = sData[idx - 4]; __EMUSYNC; - sData[idx] += t; __EMUSYNC; - t = sData[idx - 8]; __EMUSYNC; - sData[idx] += t; __EMUSYNC; - t = sData[idx - 16]; __EMUSYNC; - sData[idx] += t; __EMUSYNC; -#else - if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC; - if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC; - if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC; - if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC; - if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC; -#endif - return sData[idx] - val; // convert inclusive -> exclusive -} - -/** - * @brief Scans 4*CTA_SIZE unsigned ints in a block - * - * scan4 scans 4*CTA_SIZE numElements in a block (4 per - * thread), using a warp-scan algorithm - * - * @param[in] idata 4-vector of integers to scan -**/ -__device__ uint4 scan4(uint4 idata) -{ - extern __shared__ uint ptr[]; - - uint idx = threadIdx.x; - - uint4 val4 = idata; - uint sum[3]; - sum[0] = val4.x; - sum[1] = val4.y + sum[0]; - sum[2] = val4.z + sum[1]; - - uint val = val4.w + sum[2]; - - val = scanwarp(val, ptr); - __syncthreads(); - - if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) - { - ptr[idx >> 5] = val + val4.w + sum[2]; - } - __syncthreads(); - -#ifndef __DEVICE_EMULATION__ - if (idx < WARP_SIZE) -#endif - { - ptr[idx] = scanwarp(ptr[idx], ptr); - } - __syncthreads(); - - val += ptr[idx >> 5]; - - val4.x = val; - val4.y = val + sum[0]; - val4.z = val + sum[1]; - val4.w = val + sum[2]; - - return val4; -} - -/** - * @brief Computes output position for each thread given predicate; trues come first then falses - * - * Rank is the core of the radix sort loop. Given a predicate, it - * computes the output position for each thread in an ordering where all - * True threads come first, followed by all False threads. 
- * This version handles 4 predicates per thread; hence, "rank4". - * - * @param[in] preds true/false values for each of the 4 elements in this thread - * - * @todo is the description of "preds" correct? -**/ -template -__device__ uint4 rank4(uint4 preds) -{ - uint4 address = scan4(preds); - - __shared__ uint numtrue; - if (threadIdx.x == ctasize-1) - { - numtrue = address.w + preds.w; - } - __syncthreads(); - - uint4 rank; - uint idx = threadIdx.x << 2; - rank.x = (preds.x) ? address.x : numtrue + idx - address.x; - rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; - rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; - rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w; - - return rank; -} - -/** - * @brief Sorts one block - * - * Uses rank to sort one bit at a time: Sorts a block according - * to bits startbit -> nbits + startbit - * @param[in,out] key - * @param[in,out] value -**/ -template -__device__ void radixSortBlock(uint4 &key, uint4 &value) -{ - extern __shared__ uint sMem1[]; - for(uint shift = startbit; shift < (startbit + nbits); ++shift) - { - uint4 lsb; - lsb.x = !((key.x >> shift) & 0x1); - lsb.y = !((key.y >> shift) & 0x1); - lsb.z = !((key.z >> shift) & 0x1); - lsb.w = !((key.w >> shift) & 0x1); - - uint4 r = rank4<256>(lsb); - -#if 1 - // This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions - sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; - sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; - sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; - sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; - __syncthreads(); - - // The above allows us to read without 4-way bank conflicts: - key.x = sMem1[threadIdx.x]; - key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; - key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; - key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; - - __syncthreads(); - - sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x; - sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y; - sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z; - sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w; - __syncthreads(); - - value.x = sMem1[threadIdx.x]; - value.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; - value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; - value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; -#else - sMem1[r.x] = key.x; - sMem1[r.y] = key.y; - sMem1[r.z] = key.z; - sMem1[r.w] = key.w; - __syncthreads(); - - // This access has 4-way bank conflicts - key = sMem[threadIdx.x]; - - __syncthreads(); - - sMem1[r.x] = value.x; - sMem1[r.y] = value.y; - sMem1[r.z] = value.z; - sMem1[r.w] = value.w; - __syncthreads(); - - value = sMem[threadIdx.x]; -#endif - - __syncthreads(); - } -} - -/** - * @brief Sorts one block. Key-only version. 
- * - * Uses rank to sort one bit at a time: Sorts a block according - * to bits startbit -> nbits + startbit - * @param[in,out] key -**/ - -template -__device__ void radixSortBlockKeysOnly(uint4 &key) -{ - extern __shared__ uint sMem1[]; - for(uint shift = startbit; shift < (startbit + nbits); ++shift) - { - uint4 lsb; - lsb.x = !((key.x >> shift) & 0x1); - lsb.y = !((key.y >> shift) & 0x1); - lsb.z = !((key.z >> shift) & 0x1); - lsb.w = !((key.w >> shift) & 0x1); - - uint4 r = rank4<256>(lsb); - -#if 1 - // This arithmetic strides the ranks across 4 CTA_SIZE regions - sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; - sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; - sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; - sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; - __syncthreads(); - - // The above allows us to read without 4-way bank conflicts: - key.x = sMem1[threadIdx.x]; - key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; - key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; - key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; -#else - sMem1[r.x] = key.x; - sMem1[r.y] = key.y; - sMem1[r.z] = key.z; - sMem1[r.w] = key.w; - __syncthreads(); - - // This access has 4-way bank conflicts - key = sMem[threadIdx.x]; -#endif - - __syncthreads(); - } -} - -/** @} */ // end radix sort functions -/** @} */ // end cudpp_cta +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- +#include +#include "cudpp_radixsort.h" +#include "cta/scan_cta.cu" +#include +#include + +#include +#include +#include "sharedmem.h" + + +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @file + * sort_cta.cu + * + * @brief CUDPP CTA-level sort routines + */ + +/** \addtogroup cudpp_cta +* @{ +*/ + +/** @name Radix Sort Functions +* @{ +*/ + + +typedef unsigned int uint; + +/** + * @brief Flips bits of single-precision floating-point number (parameterized by doFlip) + * + * flip a float for sorting + * finds SIGN of fp number. + * if it's 1 (negative float), it flips all bits + * if it's 0 (positive float), it flips the sign only + * @param[in] f floating-point input (passed as unsigned int) + * @see floatUnflip +**/ + +template +__device__ uint floatFlip(uint f) +{ + if (doFlip) + { + uint mask = -int(f >> 31) | 0x80000000; + return f ^ mask; + } + else + return f; +} + +/** + * @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip) + * + * flip a float back (invert FloatFlip) + * signed was flipped from above, so: + * if sign is 1 (negative), it flips the sign bit back + * if sign is 0 (positive), it flips all bits back + * @param[in] f floating-point input (passed as unsigned int) + * @see floatFlip +**/ +template +__device__ uint floatUnflip(uint f) +{ + if (doFlip) + { + uint mask = ((f >> 31) - 1) | 0x80000000; + return f ^ mask; + } + else + return f; +} + +/** + * @brief Scans one warp quickly, optimized for 32-element warps, using shared memory + * + * Scans each warp in parallel ("warp-scan"), one element per thread. 
+ * uses 2 numElements of shared memory per thread (64 numElements per warp) + * + * @param[in] val Elements per thread to scan + * @param[in,out] sData +**/ +template +__device__ T scanwarp(T val, volatile T* sData) +{ + // The following is the same as 2 * WARP_SIZE * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1)) + int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1)); + sData[idx] = 0; + idx += WARP_SIZE; + T t = sData[idx] = val; __EMUSYNC; + +#ifdef __DEVICE_EMULATION__ + t = sData[idx - 1]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 2]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 4]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 8]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; + t = sData[idx - 16]; __EMUSYNC; + sData[idx] += t; __EMUSYNC; +#else + if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC; + if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC; + if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC; + if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC; + if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC; +#endif + return sData[idx] - val; // convert inclusive -> exclusive +} + +/** + * @brief Scans 4*CTA_SIZE unsigned ints in a block + * + * scan4 scans 4*CTA_SIZE numElements in a block (4 per + * thread), using a warp-scan algorithm + * + * @param[in] idata 4-vector of integers to scan +**/ +__device__ uint4 scan4(uint4 idata) +{ + extern __shared__ uint ptr[]; + + uint idx = threadIdx.x; + + uint4 val4 = idata; + uint sum[3]; + sum[0] = val4.x; + sum[1] = val4.y + sum[0]; + sum[2] = val4.z + sum[1]; + + uint val = val4.w + sum[2]; + + val = scanwarp(val, ptr); + __syncthreads(); + + if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1) + { + ptr[idx >> 5] = val + val4.w + sum[2]; + } + __syncthreads(); + +#ifndef __DEVICE_EMULATION__ + if (idx < WARP_SIZE) +#endif + { + ptr[idx] = scanwarp(ptr[idx], ptr); + } + __syncthreads(); + + val += ptr[idx >> 5]; + + val4.x = val; + val4.y = val + sum[0]; + val4.z = val + sum[1]; + val4.w = val + sum[2]; + + return val4; +} + +/** + * @brief Computes output position for each thread given predicate; trues come first then falses + * + * Rank is the core of the radix sort loop. Given a predicate, it + * computes the output position for each thread in an ordering where all + * True threads come first, followed by all False threads. + * This version handles 4 predicates per thread; hence, "rank4". + * + * @param[in] preds true/false values for each of the 4 elements in this thread + * + * @todo is the description of "preds" correct? +**/ +template +__device__ uint4 rank4(uint4 preds) +{ + uint4 address = scan4(preds); + + __shared__ uint numtrue; + if (threadIdx.x == ctasize-1) + { + numtrue = address.w + preds.w; + } + __syncthreads(); + + uint4 rank; + uint idx = threadIdx.x << 2; + rank.x = (preds.x) ? address.x : numtrue + idx - address.x; + rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y; + rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z; + rank.w = (preds.w) ? 
address.w : numtrue + idx + 3 - address.w; + + return rank; +} + +/** + * @brief Sorts one block + * + * Uses rank to sort one bit at a time: Sorts a block according + * to bits startbit -> nbits + startbit + * @param[in,out] key + * @param[in,out] value +**/ +template +__device__ void radixSortBlock(uint4 &key, uint4 &value) +{ + extern __shared__ uint sMem1[]; + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !((key.x >> shift) & 0x1); + lsb.y = !((key.y >> shift) & 0x1); + lsb.z = !((key.z >> shift) & 0x1); + lsb.w = !((key.w >> shift) & 0x1); + + uint4 r = rank4<256>(lsb); + +#if 1 + // This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; + __syncthreads(); + + // The above allows us to read without 4-way bank conflicts: + key.x = sMem1[threadIdx.x]; + key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; + + __syncthreads(); + + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w; + __syncthreads(); + + value.x = sMem1[threadIdx.x]; + value.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; +#else + sMem1[r.x] = key.x; + sMem1[r.y] = key.y; + sMem1[r.z] = key.z; + sMem1[r.w] = key.w; + __syncthreads(); + + // This access has 4-way bank conflicts + key = sMem[threadIdx.x]; + + __syncthreads(); + + sMem1[r.x] = value.x; + sMem1[r.y] = value.y; + sMem1[r.z] = value.z; + sMem1[r.w] = value.w; + __syncthreads(); + + value = sMem[threadIdx.x]; +#endif + + __syncthreads(); + } +} + +/** + * @brief Sorts one block. Key-only version. 
+ * + * Uses rank to sort one bit at a time: Sorts a block according + * to bits startbit -> nbits + startbit + * @param[in,out] key +**/ + +template +__device__ void radixSortBlockKeysOnly(uint4 &key) +{ + extern __shared__ uint sMem1[]; + for(uint shift = startbit; shift < (startbit + nbits); ++shift) + { + uint4 lsb; + lsb.x = !((key.x >> shift) & 0x1); + lsb.y = !((key.y >> shift) & 0x1); + lsb.z = !((key.z >> shift) & 0x1); + lsb.w = !((key.w >> shift) & 0x1); + + uint4 r = rank4<256>(lsb); + +#if 1 + // This arithmetic strides the ranks across 4 CTA_SIZE regions + sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x; + sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y; + sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z; + sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w; + __syncthreads(); + + // The above allows us to read without 4-way bank conflicts: + key.x = sMem1[threadIdx.x]; + key.y = sMem1[threadIdx.x + SORT_CTA_SIZE]; + key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE]; + key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE]; +#else + sMem1[r.x] = key.x; + sMem1[r.y] = key.y; + sMem1[r.z] = key.z; + sMem1[r.w] = key.w; + __syncthreads(); + + // This access has 4-way bank conflicts + key = sMem[threadIdx.x]; +#endif + + __syncthreads(); + } +} + +/** @} */ // end radix sort functions +/** @} */ // end cudpp_cta diff --git a/lib/gpu/cudpp_mini/cta/scan_cta.cu b/lib/gpu/cudpp_mini/cta/scan_cta.cu index 850b4f2800..574639927f 100644 --- a/lib/gpu/cudpp_mini/cta/scan_cta.cu +++ b/lib/gpu/cudpp_mini/cta/scan_cta.cu @@ -1,619 +1,619 @@ -// ------------------------------------------------------------- -// cuDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision: 5633 $ -// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt -// in the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file - * scan_cta.cu - * - * @brief CUDPP CTA-level scan routines - */ - -/** \defgroup cudpp_cta CUDPP CTA-Level API - * The CUDPP CTA-Level API contains functions that run on the GPU - * device. These are CUDA \c __device__ functions that are called - * from within other CUDA device functions (typically - * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions). - * They are called CTA-level functions because they typically process - * s_data "owned" by each CTA within shared memory, and are agnostic of - * any other CTAs that may be running (or how many CTAs are running), - * other than to compute appropriate global memory addresses. - * @{ - */ - -/** @name Scan Functions -* @{ -*/ - -#include -#include -#include -#include - -/** - * @brief Macro to insert necessary __syncthreads() in device emulation mode - */ -#ifdef __DEVICE_EMULATION__ -#define __EMUSYNC __syncthreads() -#else -#define __EMUSYNC -#endif - -/** - * @brief Template class containing compile-time parameters to the scan functions - * - * ScanTraits is passed as a template parameter to all scan functions. By - * using these compile-time functions we can enable generic code while - * maintaining the highest performance. This is crucial for the performance - * of low-level workhorse algorithms like scan. - * - * @param T The datatype of the scan - * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.) 
- * @param multiRow True if this is a multi-row scan - * @param unroll True if scan inner loops should be unrolled - * @param sums True if each block should write it's sum to the d_blockSums array (false for single-block scans) - * @param backward True if this is a backward scan - * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements) - * @param exclusive True for exclusive scans, false for inclusive scans - */ -template -class ScanTraits -{ -public: - - //! Returns true if this is a backward scan - static __device__ bool isBackward() { return backward; }; - //! Returns true if this is an exclusive scan - static __device__ bool isExclusive() { return exclusive; }; - //! Returns true if this a multi-row scan. - static __device__ bool isMultiRow() { return multiRow; }; - //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans) - static __device__ bool writeSums() { return sums; }; - //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements - static __device__ bool isFullBlock() { return fullBlock; }; - - - //! The operator function used for the scan - static __device__ T op(const T a, const T b) - { - return Operator::op(a, b); - } - - //! The identity value used by the scan - static __device__ T identity() { return Operator::identity(); } -}; - -//! This is used to insert syncthreads to avoid perf loss caused by 128-bit -//! load overlap that happens on G80. This gives about a 15% boost on scans on -//! G80. -//! @todo Parameterize this in case this perf detail changes on future GPUs. -#define DISALLOW_LOADSTORE_OVERLAP 1 - -/** -* @brief Handles loading input s_data from global memory to shared memory -* (vec4 version) -* -* Load a chunk of 8*blockDim.x elements from global memory into a -* shared memory array. Each thread loads two T4 elements (where -* T4 is, e.g. int4 or float4), computes the scan of those two vec4s in -* thread local arrays (in registers), and writes the two total sums of the -* vec4s into shared memory, where they will be cooperatively scanned with -* the other partial sums by all threads in the CTA. 
-* -* @param[out] s_out The output (shared) memory array -* @param[out] threadScan0 Intermediate per-thread partial sums array 1 -* @param[out] threadScan1 Intermediate per-thread partial sums array 2 -* @param[in] d_in The input (device) memory array -* @param[in] numElements The number of elements in the array being scanned -* @param[in] iDataOffset the offset of the input array in global memory for this -* thread block -* @param[out] ai The shared memory address for the thread's first element -* (returned for reuse) -* @param[out] bi The shared memory address for the thread's second element -* (returned for reuse) -* @param[out] aiDev The device memory address for this thread's first element -* (returned for reuse) -* @param[out] biDev The device memory address for this thread's second element -* (returned for reuse) -*/ -template -__device__ void loadSharedChunkFromMem4(T *s_out, - T threadScan0[4], - T threadScan1[4], - const T *d_in, - int numElements, - int iDataOffset, - int &ai, - int &bi, - int &aiDev, - int &biDev) -{ - int thid = threadIdx.x; - aiDev = iDataOffset + thid; - biDev = aiDev + blockDim.x; - - // convert to 4-vector - typename typeToVector::Result tempData; - typename typeToVector::Result* inData = (typename typeToVector::Result*)d_in; - - ai = thid; - bi = thid + blockDim.x; - - // read into tempData; - if (traits::isBackward()) - { - int i = aiDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - tempData = inData[aiDev]; - threadScan0[3] = tempData.w; - threadScan0[2] = traits::op(tempData.z, threadScan0[3]); - threadScan0[1] = traits::op(tempData.y, threadScan0[2]); - threadScan0[0] = s_out[ai] - = traits::op(tempData.x, threadScan0[1]); - } - else - { - threadScan0[3] = traits::identity(); - threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]); - threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]); - threadScan0[0] = s_out[ai] - = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]); - } - -#ifdef DISALLOW_LOADSTORE_OVERLAP - __syncthreads(); -#endif - - i = biDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - tempData = inData[biDev]; - threadScan1[3] = tempData.w; - threadScan1[2] = traits::op(tempData.z, threadScan1[3]); - threadScan1[1] = traits::op(tempData.y, threadScan1[2]); - threadScan1[0] = s_out[bi] - = traits::op(tempData.x, threadScan1[1]); - } - else - { - threadScan1[3] = traits::identity(); - threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]); - threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]); - threadScan1[0] = s_out[bi] - = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]); - } - __syncthreads(); - - // reverse s_data in shared memory - if (ai < CTA_SIZE) - { - unsigned int leftIdx = ai; - unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; - - if (leftIdx < rightIdx) - { - T tmp = s_out[leftIdx]; - s_out[leftIdx] = s_out[rightIdx]; - s_out[rightIdx] = tmp; - } - } - __syncthreads(); - } - else - { - int i = aiDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - tempData = inData[aiDev]; - threadScan0[0] = tempData.x; - threadScan0[1] = traits::op(tempData.y, threadScan0[0]); - threadScan0[2] = traits::op(tempData.z, threadScan0[1]); - threadScan0[3] = s_out[ai] - = traits::op(tempData.w, threadScan0[2]); - } - else - { - threadScan0[0] = (i < numElements) ? 
d_in[i] : traits::identity(); - threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]); - threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]); - threadScan0[3] = s_out[ai] - = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]); - } - - -#ifdef DISALLOW_LOADSTORE_OVERLAP - __syncthreads(); -#endif - - i = biDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - tempData = inData[biDev]; - threadScan1[0] = tempData.x; - threadScan1[1] = traits::op(tempData.y, threadScan1[0]); - threadScan1[2] = traits::op(tempData.z, threadScan1[1]); - threadScan1[3] = s_out[bi] - = traits::op(tempData.w, threadScan1[2]); - } - else - { - threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity(); - threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]); - threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]); - threadScan1[3] = s_out[bi] - = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]); - } - __syncthreads(); - } -} - - -/** -* @brief Handles storing result s_data from shared memory to global memory -* (vec4 version) -* -* Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory -* into a device memory array. Each thread stores reads two elements from shared -* memory, adds them to the intermediate sums computed in -* loadSharedChunkFromMem4(), and writes two T4 elements (where -* T4 is, e.g. int4 or float4) to global memory. -* -* @param[out] d_out The output (device) memory array -* @param[in] threadScan0 Intermediate per-thread partial sums array 1 -* (contents computed in loadSharedChunkFromMem4()) -* @param[in] threadScan1 Intermediate per-thread partial sums array 2 -* (contents computed in loadSharedChunkFromMem4()) -* @param[in] s_in The input (shared) memory array -* @param[in] numElements The number of elements in the array being scanned -* @param[in] oDataOffset the offset of the output array in global memory -* for this thread block -* @param[in] ai The shared memory address for the thread's first element -* (computed in loadSharedChunkFromMem4()) -* @param[in] bi The shared memory address for the thread's second element -* (computed in loadSharedChunkFromMem4()) -* @param[in] aiDev The device memory address for this thread's first element -* (computed in loadSharedChunkFromMem4()) -* @param[in] biDev The device memory address for this thread's second element -* (computed in loadSharedChunkFromMem4()) -*/ -template -__device__ void storeSharedChunkToMem4(T *d_out, - T threadScan0[4], - T threadScan1[4], - T *s_in, - int numElements, - int oDataOffset, - int ai, - int bi, - int aiDev, - int biDev) -{ - // Convert to 4-vector - typename typeToVector::Result tempData; - typename typeToVector::Result* outData = (typename typeToVector::Result*)d_out; - - // write results to global memory - if (traits::isBackward()) - { - if (ai < CTA_SIZE) - { - - unsigned int leftIdx = ai; - unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; - - if (leftIdx < rightIdx) - { - T tmp = s_in[leftIdx]; - s_in[leftIdx] = s_in[rightIdx]; - s_in[rightIdx] = tmp; - } - } - __syncthreads(); - - T temp = s_in[ai]; - - if (traits::isExclusive()) - { - tempData.w = temp; - tempData.z = traits::op(temp, threadScan0[3]); - tempData.y = traits::op(temp, threadScan0[2]); - tempData.x = traits::op(temp, threadScan0[1]); - } - else - { - tempData.w 
= traits::op(temp, threadScan0[3]); - tempData.z = traits::op(temp, threadScan0[2]); - tempData.y = traits::op(temp, threadScan0[1]); - tempData.x = traits::op(temp, threadScan0[0]); - } - - int i = aiDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - outData[aiDev] = tempData; - } - else - { - if (i < numElements) { d_out[i] = tempData.x; - if (i+1 < numElements) { d_out[i+1] = tempData.y; - if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} - } - -#ifdef DISALLOW_LOADSTORE_OVERLAP - __syncthreads(); -#endif - - temp = s_in[bi]; - - if (traits::isExclusive()) - { - tempData.w = temp; - tempData.z = traits::op(temp, threadScan1[3]); - tempData.y = traits::op(temp, threadScan1[2]); - tempData.x = traits::op(temp, threadScan1[1]); - } - else - { - tempData.w = traits::op(temp, threadScan1[3]); - tempData.z = traits::op(temp, threadScan1[2]); - tempData.y = traits::op(temp, threadScan1[1]); - tempData.x = traits::op(temp, threadScan1[0]); - } - - i = biDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - outData[biDev] = tempData; - } - else - { - if (i < numElements) { d_out[i] = tempData.x; - if (i+1 < numElements) { d_out[i+1] = tempData.y; - if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} - } - } - else - { - T temp; - temp = s_in[ai]; - - if (traits::isExclusive()) - { - tempData.x = temp; - tempData.y = traits::op(temp, threadScan0[0]); - tempData.z = traits::op(temp, threadScan0[1]); - tempData.w = traits::op(temp, threadScan0[2]); - } - else - { - tempData.x = traits::op(temp, threadScan0[0]); - tempData.y = traits::op(temp, threadScan0[1]); - tempData.z = traits::op(temp, threadScan0[2]); - tempData.w = traits::op(temp, threadScan0[3]); - } - - int i = aiDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - outData[aiDev] = tempData; - } - else - { - // we can't use vec4 because the original array isn't a multiple of - // 4 elements - if ( i < numElements) { d_out[i] = tempData.x; - if ((i+1) < numElements) { d_out[i+1] = tempData.y; - if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } - } - -#ifdef DISALLOW_LOADSTORE_OVERLAP - __syncthreads(); -#endif - - temp = s_in[bi]; - - if (traits::isExclusive()) - { - tempData.x = temp; - tempData.y = traits::op(temp, threadScan1[0]); - tempData.z = traits::op(temp, threadScan1[1]); - tempData.w = traits::op(temp, threadScan1[2]); - } - else - { - tempData.x = traits::op(temp, threadScan1[0]); - tempData.y = traits::op(temp, threadScan1[1]); - tempData.z = traits::op(temp, threadScan1[2]); - tempData.w = traits::op(temp, threadScan1[3]); - } - - i = biDev * 4; - if (traits::isFullBlock() || i + 3 < numElements) - { - outData[biDev] = tempData; - } - else - { - // we can't use vec4 because the original array isn't a multiple of - // 4 elements - if ( i < numElements) { d_out[i] = tempData.x; - if ((i+1) < numElements) { d_out[i+1] = tempData.y; - if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } - } - } -} - -/** @brief Scan all warps of a CTA without synchronization - * - * The warp-scan algorithm breaks a block of data into warp-sized chunks, and - * scans the chunks independently with a warp of threads each. Because warps - * execute instructions in SIMD fashion, there is no need to synchronize in - * order to share data within a warp (only across warps). Also, in SIMD the - * most efficient algorithm is a step-efficient algorithm. 
Therefore, within - * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps - * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the - * work-efficient tree-based algorithm described by Guy Blelloch [1990] that - * takes 2 * log(N) steps and is in general more complex to implement. - * Previous versions of CUDPP used the Blelloch algorithm. For current GPUs, - * the warp size is 32, so this takes five steps per warp. - * - * Each thread is responsible for a single element of the array to be scanned. - * Each thread inputs a single value to the scan via \a val and returns - * its own scanned result element. The threads of each warp cooperate - * via the shared memory array \a s_data to scan WARP_SIZE elements. - * - * Template parameter \a maxlevel allows this warpscan to be performed on - * partial warps. For example, if only the first 8 elements of each warp need - * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5. - * - * The computation uses 2 * WARP_SIZE elements of shared memory per warp to - * enable warps to offset beyond their input data and receive the identity - * element without using any branch instructions. - * - * \note s_data is declared volatile here to prevent the compiler from - * optimizing away writes to shared memory, and ensure correct intrawarp - * communication in the absence of __syncthreads. - * - * @return The result of the warp scan for the current thread - * @param[in] val The current threads's input to the scan - * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE - * elements used to compute the warp scans - */ -template -__device__ T warpscan(T val, volatile T* s_data) -{ - // The following is the same as 2 * 32 * warpId + threadInWarp = - // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1)) - int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1)); - s_data[idx] = traits::identity(); - idx += WARP_SIZE; - T t = s_data[idx] = val; __EMUSYNC; - - // This code is needed because the warp size of device emulation - // is only 1 thread, so sync-less cooperation within a warp doesn't - // work. -#ifdef __DEVICE_EMULATION__ - t = s_data[idx - 1]; __EMUSYNC; - s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; - t = s_data[idx - 2]; __EMUSYNC; - s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; - t = s_data[idx - 4]; __EMUSYNC; - s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; - t = s_data[idx - 8]; __EMUSYNC; - s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; - t = s_data[idx - 16]; __EMUSYNC; - s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; -#else - if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); } - if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); } - if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); } - if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); } - if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); } -#endif - - return s_data[idx-1]; // convert inclusive -> exclusive -} - -/** @brief Perform a full CTA scan using the warp-scan algorithm - * - * As described in the comment for warpscan(), the warp-scan algorithm breaks - * a block of data into warp-sized chunks, and scans the chunks independently - * with a warp of threads each. To complete the scan, each warp j then - * writes its last element to element j of a temporary shared array. - * Then a single warp exclusive-scans these "warp sums". 
Finally, each thread - * adds the result of the warp sum scan to the result of the scan from the - * first pass. - * - * Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan - * twice. - * - * @param x The first input value for the current thread - * @param y The second input value for the current thread - * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for - * performing the scan - */ -template -__device__ void scanWarps(T x, T y, - T *s_data) -{ - T val = warpscan(x, s_data); - __syncthreads(); - T val2 = warpscan(y, s_data); - - int idx = threadIdx.x; - - if ((idx & 31)==31) - { - s_data[idx >> 5] = traits::op(val, x); - s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y); - } - __syncthreads(); - -#ifndef __DEVICE_EMULATION__ - if (idx < 32) -#endif - { - s_data[idx] = warpscan(s_data[idx], s_data); - } - __syncthreads(); - - val = traits::op(val, s_data[idx >> 5]); - - val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]); - - __syncthreads(); - - s_data[idx] = val; - s_data[idx+blockDim.x] = val2; -} - -/** -* @brief CTA-level scan routine; scans s_data in shared memory in each thread block -* -* This function is the main CTA-level scan function. It may be called by other -* CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements. -* Each thread is responsible for one element in each half of the input array. -* \note This code is intended to be run on a CTA of 128 threads. Other sizes are -* untested. -* -* @param[in] s_data The array to be scanned in shared memory -* @param[out] d_blockSums Array of per-block sums -* @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum -*/ -template -__device__ void scanCTA(T *s_data, - T *d_blockSums, - unsigned int blockSumIndex) -{ - T val = s_data[threadIdx.x]; - T val2 = s_data[threadIdx.x + blockDim.x]; - __syncthreads(); - - scanWarps(val, val2, s_data); - __syncthreads(); - - if (traits::writeSums() && threadIdx.x == blockDim.x - 1) - { - d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]); - } - - -#ifdef __DEVICE_EMULATION__ - // must sync in emulation mode when doing backward scans, because otherwise the - // shared memory array will get reversed before the block sums are read! - if (traits::isBackward()) - __syncthreads(); -#endif -} - - -/** @} */ // end scan functions -/** @} */ // end cudpp_cta +// ------------------------------------------------------------- +// cuDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision: 5633 $ +// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +/** + * @file + * scan_cta.cu + * + * @brief CUDPP CTA-level scan routines + */ + +/** \defgroup cudpp_cta CUDPP CTA-Level API + * The CUDPP CTA-Level API contains functions that run on the GPU + * device. These are CUDA \c __device__ functions that are called + * from within other CUDA device functions (typically + * \link cudpp_kernel CUDPP Kernel-Level API\endlink functions). 
+ * They are called CTA-level functions because they typically process + * s_data "owned" by each CTA within shared memory, and are agnostic of + * any other CTAs that may be running (or how many CTAs are running), + * other than to compute appropriate global memory addresses. + * @{ + */ + +/** @name Scan Functions +* @{ +*/ + +#include +#include +#include +#include + +/** + * @brief Macro to insert necessary __syncthreads() in device emulation mode + */ +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @brief Template class containing compile-time parameters to the scan functions + * + * ScanTraits is passed as a template parameter to all scan functions. By + * using these compile-time functions we can enable generic code while + * maintaining the highest performance. This is crucial for the performance + * of low-level workhorse algorithms like scan. + * + * @param T The datatype of the scan + * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.) + * @param multiRow True if this is a multi-row scan + * @param unroll True if scan inner loops should be unrolled + * @param sums True if each block should write it's sum to the d_blockSums array (false for single-block scans) + * @param backward True if this is a backward scan + * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements) + * @param exclusive True for exclusive scans, false for inclusive scans + */ +template +class ScanTraits +{ +public: + + //! Returns true if this is a backward scan + static __device__ bool isBackward() { return backward; }; + //! Returns true if this is an exclusive scan + static __device__ bool isExclusive() { return exclusive; }; + //! Returns true if this a multi-row scan. + static __device__ bool isMultiRow() { return multiRow; }; + //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans) + static __device__ bool writeSums() { return sums; }; + //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements + static __device__ bool isFullBlock() { return fullBlock; }; + + + //! The operator function used for the scan + static __device__ T op(const T a, const T b) + { + return Operator::op(a, b); + } + + //! The identity value used by the scan + static __device__ T identity() { return Operator::identity(); } +}; + +//! This is used to insert syncthreads to avoid perf loss caused by 128-bit +//! load overlap that happens on G80. This gives about a 15% boost on scans on +//! G80. +//! @todo Parameterize this in case this perf detail changes on future GPUs. +#define DISALLOW_LOADSTORE_OVERLAP 1 + +/** +* @brief Handles loading input s_data from global memory to shared memory +* (vec4 version) +* +* Load a chunk of 8*blockDim.x elements from global memory into a +* shared memory array. Each thread loads two T4 elements (where +* T4 is, e.g. int4 or float4), computes the scan of those two vec4s in +* thread local arrays (in registers), and writes the two total sums of the +* vec4s into shared memory, where they will be cooperatively scanned with +* the other partial sums by all threads in the CTA. 
+* +* @param[out] s_out The output (shared) memory array +* @param[out] threadScan0 Intermediate per-thread partial sums array 1 +* @param[out] threadScan1 Intermediate per-thread partial sums array 2 +* @param[in] d_in The input (device) memory array +* @param[in] numElements The number of elements in the array being scanned +* @param[in] iDataOffset the offset of the input array in global memory for this +* thread block +* @param[out] ai The shared memory address for the thread's first element +* (returned for reuse) +* @param[out] bi The shared memory address for the thread's second element +* (returned for reuse) +* @param[out] aiDev The device memory address for this thread's first element +* (returned for reuse) +* @param[out] biDev The device memory address for this thread's second element +* (returned for reuse) +*/ +template +__device__ void loadSharedChunkFromMem4(T *s_out, + T threadScan0[4], + T threadScan1[4], + const T *d_in, + int numElements, + int iDataOffset, + int &ai, + int &bi, + int &aiDev, + int &biDev) +{ + int thid = threadIdx.x; + aiDev = iDataOffset + thid; + biDev = aiDev + blockDim.x; + + // convert to 4-vector + typename typeToVector::Result tempData; + typename typeToVector::Result* inData = (typename typeToVector::Result*)d_in; + + ai = thid; + bi = thid + blockDim.x; + + // read into tempData; + if (traits::isBackward()) + { + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[aiDev]; + threadScan0[3] = tempData.w; + threadScan0[2] = traits::op(tempData.z, threadScan0[3]); + threadScan0[1] = traits::op(tempData.y, threadScan0[2]); + threadScan0[0] = s_out[ai] + = traits::op(tempData.x, threadScan0[1]); + } + else + { + threadScan0[3] = traits::identity(); + threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]); + threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]); + threadScan0[0] = s_out[ai] + = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]); + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[biDev]; + threadScan1[3] = tempData.w; + threadScan1[2] = traits::op(tempData.z, threadScan1[3]); + threadScan1[1] = traits::op(tempData.y, threadScan1[2]); + threadScan1[0] = s_out[bi] + = traits::op(tempData.x, threadScan1[1]); + } + else + { + threadScan1[3] = traits::identity(); + threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]); + threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]); + threadScan1[0] = s_out[bi] + = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]); + } + __syncthreads(); + + // reverse s_data in shared memory + if (ai < CTA_SIZE) + { + unsigned int leftIdx = ai; + unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; + + if (leftIdx < rightIdx) + { + T tmp = s_out[leftIdx]; + s_out[leftIdx] = s_out[rightIdx]; + s_out[rightIdx] = tmp; + } + } + __syncthreads(); + } + else + { + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[aiDev]; + threadScan0[0] = tempData.x; + threadScan0[1] = traits::op(tempData.y, threadScan0[0]); + threadScan0[2] = traits::op(tempData.z, threadScan0[1]); + threadScan0[3] = s_out[ai] + = traits::op(tempData.w, threadScan0[2]); + } + else + { + threadScan0[0] = (i < numElements) ? 
d_in[i] : traits::identity(); + threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]); + threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]); + threadScan0[3] = s_out[ai] + = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]); + } + + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + tempData = inData[biDev]; + threadScan1[0] = tempData.x; + threadScan1[1] = traits::op(tempData.y, threadScan1[0]); + threadScan1[2] = traits::op(tempData.z, threadScan1[1]); + threadScan1[3] = s_out[bi] + = traits::op(tempData.w, threadScan1[2]); + } + else + { + threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity(); + threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]); + threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]); + threadScan1[3] = s_out[bi] + = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]); + } + __syncthreads(); + } +} + + +/** +* @brief Handles storing result s_data from shared memory to global memory +* (vec4 version) +* +* Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory +* into a device memory array. Each thread stores reads two elements from shared +* memory, adds them to the intermediate sums computed in +* loadSharedChunkFromMem4(), and writes two T4 elements (where +* T4 is, e.g. int4 or float4) to global memory. +* +* @param[out] d_out The output (device) memory array +* @param[in] threadScan0 Intermediate per-thread partial sums array 1 +* (contents computed in loadSharedChunkFromMem4()) +* @param[in] threadScan1 Intermediate per-thread partial sums array 2 +* (contents computed in loadSharedChunkFromMem4()) +* @param[in] s_in The input (shared) memory array +* @param[in] numElements The number of elements in the array being scanned +* @param[in] oDataOffset the offset of the output array in global memory +* for this thread block +* @param[in] ai The shared memory address for the thread's first element +* (computed in loadSharedChunkFromMem4()) +* @param[in] bi The shared memory address for the thread's second element +* (computed in loadSharedChunkFromMem4()) +* @param[in] aiDev The device memory address for this thread's first element +* (computed in loadSharedChunkFromMem4()) +* @param[in] biDev The device memory address for this thread's second element +* (computed in loadSharedChunkFromMem4()) +*/ +template +__device__ void storeSharedChunkToMem4(T *d_out, + T threadScan0[4], + T threadScan1[4], + T *s_in, + int numElements, + int oDataOffset, + int ai, + int bi, + int aiDev, + int biDev) +{ + // Convert to 4-vector + typename typeToVector::Result tempData; + typename typeToVector::Result* outData = (typename typeToVector::Result*)d_out; + + // write results to global memory + if (traits::isBackward()) + { + if (ai < CTA_SIZE) + { + + unsigned int leftIdx = ai; + unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai; + + if (leftIdx < rightIdx) + { + T tmp = s_in[leftIdx]; + s_in[leftIdx] = s_in[rightIdx]; + s_in[rightIdx] = tmp; + } + } + __syncthreads(); + + T temp = s_in[ai]; + + if (traits::isExclusive()) + { + tempData.w = temp; + tempData.z = traits::op(temp, threadScan0[3]); + tempData.y = traits::op(temp, threadScan0[2]); + tempData.x = traits::op(temp, threadScan0[1]); + } + else + { + tempData.w 
= traits::op(temp, threadScan0[3]); + tempData.z = traits::op(temp, threadScan0[2]); + tempData.y = traits::op(temp, threadScan0[1]); + tempData.x = traits::op(temp, threadScan0[0]); + } + + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[aiDev] = tempData; + } + else + { + if (i < numElements) { d_out[i] = tempData.x; + if (i+1 < numElements) { d_out[i+1] = tempData.y; + if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + temp = s_in[bi]; + + if (traits::isExclusive()) + { + tempData.w = temp; + tempData.z = traits::op(temp, threadScan1[3]); + tempData.y = traits::op(temp, threadScan1[2]); + tempData.x = traits::op(temp, threadScan1[1]); + } + else + { + tempData.w = traits::op(temp, threadScan1[3]); + tempData.z = traits::op(temp, threadScan1[2]); + tempData.y = traits::op(temp, threadScan1[1]); + tempData.x = traits::op(temp, threadScan1[0]); + } + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[biDev] = tempData; + } + else + { + if (i < numElements) { d_out[i] = tempData.x; + if (i+1 < numElements) { d_out[i+1] = tempData.y; + if (i+2 < numElements) { d_out[i+2] = tempData.z; }}} + } + } + else + { + T temp; + temp = s_in[ai]; + + if (traits::isExclusive()) + { + tempData.x = temp; + tempData.y = traits::op(temp, threadScan0[0]); + tempData.z = traits::op(temp, threadScan0[1]); + tempData.w = traits::op(temp, threadScan0[2]); + } + else + { + tempData.x = traits::op(temp, threadScan0[0]); + tempData.y = traits::op(temp, threadScan0[1]); + tempData.z = traits::op(temp, threadScan0[2]); + tempData.w = traits::op(temp, threadScan0[3]); + } + + int i = aiDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[aiDev] = tempData; + } + else + { + // we can't use vec4 because the original array isn't a multiple of + // 4 elements + if ( i < numElements) { d_out[i] = tempData.x; + if ((i+1) < numElements) { d_out[i+1] = tempData.y; + if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } + } + +#ifdef DISALLOW_LOADSTORE_OVERLAP + __syncthreads(); +#endif + + temp = s_in[bi]; + + if (traits::isExclusive()) + { + tempData.x = temp; + tempData.y = traits::op(temp, threadScan1[0]); + tempData.z = traits::op(temp, threadScan1[1]); + tempData.w = traits::op(temp, threadScan1[2]); + } + else + { + tempData.x = traits::op(temp, threadScan1[0]); + tempData.y = traits::op(temp, threadScan1[1]); + tempData.z = traits::op(temp, threadScan1[2]); + tempData.w = traits::op(temp, threadScan1[3]); + } + + i = biDev * 4; + if (traits::isFullBlock() || i + 3 < numElements) + { + outData[biDev] = tempData; + } + else + { + // we can't use vec4 because the original array isn't a multiple of + // 4 elements + if ( i < numElements) { d_out[i] = tempData.x; + if ((i+1) < numElements) { d_out[i+1] = tempData.y; + if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } } + } + } +} + +/** @brief Scan all warps of a CTA without synchronization + * + * The warp-scan algorithm breaks a block of data into warp-sized chunks, and + * scans the chunks independently with a warp of threads each. Because warps + * execute instructions in SIMD fashion, there is no need to synchronize in + * order to share data within a warp (only across warps). Also, in SIMD the + * most efficient algorithm is a step-efficient algorithm. 
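A minimal sketch of such a step-efficient scan may help here, assuming a 32-thread warp, addition as the operator, and pre-Volta lockstep SIMD execution; the function name and the sum specialization are illustrative, not CUDPP's API (CUDPP's generic version is the warpscan() below).

// Hillis-Steele step pattern, specialized to summing unsigned ints across
// one 32-thread warp. s_data must provide 2*WARP_SIZE entries for this
// warp; the low half is padded with the identity so the idx-offset reads
// never need a branch. Correct only under lockstep execution within a warp.
__device__ unsigned int warp_inclusive_sum(unsigned int val,
                                           volatile unsigned int *s_data)
{
    const int lane = threadIdx.x & 31;
    const int idx  = lane + 32;      // skip the identity padding
    s_data[lane] = 0;                // identity element for addition
    s_data[idx]  = val;
    // log2(32) = 5 steps; no __syncthreads() needed within one warp
    for (int offset = 1; offset < 32; offset <<= 1)
        s_data[idx] = s_data[idx] + s_data[idx - offset];
    return s_data[idx];
}

Each step doubles the reach of every partial sum, so five steps cover the full warp.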
Therefore, within + * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps + * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the + * work-efficient tree-based algorithm described by Guy Blelloch [1990] that + * takes 2 * log(N) steps and is in general more complex to implement. + * Previous versions of CUDPP used the Blelloch algorithm. For current GPUs, + * the warp size is 32, so this takes five steps per warp. + * + * Each thread is responsible for a single element of the array to be scanned. + * Each thread inputs a single value to the scan via \a val and returns + * its own scanned result element. The threads of each warp cooperate + * via the shared memory array \a s_data to scan WARP_SIZE elements. + * + * Template parameter \a maxlevel allows this warpscan to be performed on + * partial warps. For example, if only the first 8 elements of each warp need + * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5. + * + * The computation uses 2 * WARP_SIZE elements of shared memory per warp to + * enable warps to offset beyond their input data and receive the identity + * element without using any branch instructions. + * + * \note s_data is declared volatile here to prevent the compiler from + * optimizing away writes to shared memory, and ensure correct intrawarp + * communication in the absence of __syncthreads. + * + * @return The result of the warp scan for the current thread + * @param[in] val The current threads's input to the scan + * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE + * elements used to compute the warp scans + */ +template +__device__ T warpscan(T val, volatile T* s_data) +{ + // The following is the same as 2 * 32 * warpId + threadInWarp = + // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1)) + int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1)); + s_data[idx] = traits::identity(); + idx += WARP_SIZE; + T t = s_data[idx] = val; __EMUSYNC; + + // This code is needed because the warp size of device emulation + // is only 1 thread, so sync-less cooperation within a warp doesn't + // work. +#ifdef __DEVICE_EMULATION__ + t = s_data[idx - 1]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 2]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 4]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 8]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; + t = s_data[idx - 16]; __EMUSYNC; + s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC; +#else + if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); } + if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); } + if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); } + if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); } + if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); } +#endif + + return s_data[idx-1]; // convert inclusive -> exclusive +} + +/** @brief Perform a full CTA scan using the warp-scan algorithm + * + * As described in the comment for warpscan(), the warp-scan algorithm breaks + * a block of data into warp-sized chunks, and scans the chunks independently + * with a warp of threads each. To complete the scan, each warp j then + * writes its last element to element j of a temporary shared array. + * Then a single warp exclusive-scans these "warp sums". 
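Under the same assumptions, that composition can be sketched as follows, reusing warp_inclusive_sum() from the sketch above for a fixed 128-thread CTA; the serial scan of the four warp totals is a simplification for clarity, where CUDPP instead uses a single warp.

// Two-level scan for a 128-thread CTA (4 warps). s_scratch needs 2*128
// entries, s_sums needs 4.
__device__ unsigned int cta_inclusive_sum(unsigned int val,
                                          volatile unsigned int *s_scratch,
                                          volatile unsigned int *s_sums)
{
    const int warp = threadIdx.x >> 5;
    const int lane = threadIdx.x & 31;

    // 1. each warp scans its own 32 values independently
    unsigned int v = warp_inclusive_sum(val, s_scratch + 64 * warp);
    __syncthreads();

    // 2. warp j publishes its total as element j of the warp-sum array
    if (lane == 31)
        s_sums[warp] = v;
    __syncthreads();

    // 3. exclusive-scan the warp totals (serially here, for clarity)
    if (threadIdx.x == 0)
    {
        unsigned int running = 0;
        for (int w = 0; w < 4; ++w)
        {
            unsigned int t = s_sums[w];
            s_sums[w] = running;
            running += t;
        }
    }
    __syncthreads();

    // 4. offset each warp's partial result by the warps before it
    return v + s_sums[warp];
}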
Finally, each thread + * adds the result of the warp sum scan to the result of the scan from the + * first pass. + * + * Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan + * twice. + * + * @param x The first input value for the current thread + * @param y The second input value for the current thread + * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for + * performing the scan + */ +template +__device__ void scanWarps(T x, T y, + T *s_data) +{ + T val = warpscan(x, s_data); + __syncthreads(); + T val2 = warpscan(y, s_data); + + int idx = threadIdx.x; + + if ((idx & 31)==31) + { + s_data[idx >> 5] = traits::op(val, x); + s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y); + } + __syncthreads(); + +#ifndef __DEVICE_EMULATION__ + if (idx < 32) +#endif + { + s_data[idx] = warpscan(s_data[idx], s_data); + } + __syncthreads(); + + val = traits::op(val, s_data[idx >> 5]); + + val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]); + + __syncthreads(); + + s_data[idx] = val; + s_data[idx+blockDim.x] = val2; +} + +/** +* @brief CTA-level scan routine; scans s_data in shared memory in each thread block +* +* This function is the main CTA-level scan function. It may be called by other +* CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements. +* Each thread is responsible for one element in each half of the input array. +* \note This code is intended to be run on a CTA of 128 threads. Other sizes are +* untested. +* +* @param[in] s_data The array to be scanned in shared memory +* @param[out] d_blockSums Array of per-block sums +* @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum +*/ +template +__device__ void scanCTA(T *s_data, + T *d_blockSums, + unsigned int blockSumIndex) +{ + T val = s_data[threadIdx.x]; + T val2 = s_data[threadIdx.x + blockDim.x]; + __syncthreads(); + + scanWarps(val, val2, s_data); + __syncthreads(); + + if (traits::writeSums() && threadIdx.x == blockDim.x - 1) + { + d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]); + } + + +#ifdef __DEVICE_EMULATION__ + // must sync in emulation mode when doing backward scans, because otherwise the + // shared memory array will get reversed before the block sums are read! + if (traits::isBackward()) + __syncthreads(); +#endif +} + + +/** @} */ // end scan functions +/** @} */ // end cudpp_cta diff --git a/lib/gpu/cudpp_mini/cudpp.cpp b/lib/gpu/cudpp_mini/cudpp.cpp index 7a0dfac9f5..da5975406d 100644 --- a/lib/gpu/cudpp_mini/cudpp.cpp +++ b/lib/gpu/cudpp_mini/cudpp.cpp @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt in // the root directory of this source distribution. // ------------------------------------------------------------- - + /** * @file * cudpp.cpp diff --git a/lib/gpu/cudpp_mini/cudpp.h b/lib/gpu/cudpp_mini/cudpp.h index 3093681523..f04c0272be 100644 --- a/lib/gpu/cudpp_mini/cudpp.h +++ b/lib/gpu/cudpp_mini/cudpp.h @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt in // the root directory of this source distribution. 
// ------------------------------------------------------------- - + /** * @file * cudpp.h diff --git a/lib/gpu/cudpp_mini/cudpp_globals.h b/lib/gpu/cudpp_mini/cudpp_globals.h index 3d18a5727c..2c89665e9b 100644 --- a/lib/gpu/cudpp_mini/cudpp_globals.h +++ b/lib/gpu/cudpp_mini/cudpp_globals.h @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt in // the root directory of this source distribution. // ------------------------------------------------------------- - + /** * @file * cudpp_globals.h diff --git a/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp b/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp index 5bf3a55d98..904d101b3b 100644 --- a/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp +++ b/lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp @@ -8,7 +8,7 @@ // in the root directory of this source distribution. // ------------------------------------------------------------- #include "cudpp_maximal_launch.h" - + inline size_t min(size_t x, size_t y) { return (x <= y) ? x : y; diff --git a/lib/gpu/cudpp_mini/cudpp_maximal_launch.h b/lib/gpu/cudpp_mini/cudpp_maximal_launch.h index 54e31c352a..e601c3118b 100644 --- a/lib/gpu/cudpp_mini/cudpp_maximal_launch.h +++ b/lib/gpu/cudpp_mini/cudpp_maximal_launch.h @@ -9,7 +9,7 @@ // ------------------------------------------------------------- #ifndef _MAXIMAL_LAUNCH_H_ #define _MAXIMAL_LAUNCH_H_ - + #include "cuda_runtime.h" extern "C" diff --git a/lib/gpu/cudpp_mini/cudpp_plan.cpp b/lib/gpu/cudpp_mini/cudpp_plan.cpp index 62d6a3da69..b64b7b37ff 100644 --- a/lib/gpu/cudpp_mini/cudpp_plan.cpp +++ b/lib/gpu/cudpp_mini/cudpp_plan.cpp @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt // in the root directory of this source distribution. // ------------------------------------------------------------- - + #include "cudpp.h" #include "cudpp_plan_manager.h" #include "cudpp_scan.h" diff --git a/lib/gpu/cudpp_mini/cudpp_plan.h b/lib/gpu/cudpp_mini/cudpp_plan.h index 4e4b3bafb7..3ec866d8d2 100644 --- a/lib/gpu/cudpp_mini/cudpp_plan.h +++ b/lib/gpu/cudpp_mini/cudpp_plan.h @@ -9,7 +9,7 @@ // ------------------------------------------------------------- #ifndef __CUDPP_PLAN_H__ #define __CUDPP_PLAN_H__ - + typedef void* KernelPointer; extern "C" size_t getNumCTAs(KernelPointer kernel); diff --git a/lib/gpu/cudpp_mini/cudpp_plan_manager.cpp b/lib/gpu/cudpp_mini/cudpp_plan_manager.cpp index 492b10cb04..dd9d6eb9db 100644 --- a/lib/gpu/cudpp_mini/cudpp_plan_manager.cpp +++ b/lib/gpu/cudpp_mini/cudpp_plan_manager.cpp @@ -11,7 +11,7 @@ #include "cudpp_plan.h" #include "cudpp_plan_manager.h" #include "cudpp_maximal_launch.h" - + typedef void* KernelPointer; extern "C" size_t getNumCTAs(KernelPointer kernel) diff --git a/lib/gpu/cudpp_mini/cudpp_plan_manager.h b/lib/gpu/cudpp_mini/cudpp_plan_manager.h index fcf33a43ec..a55a1f0d25 100644 --- a/lib/gpu/cudpp_mini/cudpp_plan_manager.h +++ b/lib/gpu/cudpp_mini/cudpp_plan_manager.h @@ -9,7 +9,7 @@ // ------------------------------------------------------------- #ifndef __CUDPP_PLAN_MANAGER_H__ #define __CUDPP_PLAN_MANAGER_H__ - + #include class CUDPPPlan; diff --git a/lib/gpu/cudpp_mini/cudpp_radixsort.h b/lib/gpu/cudpp_mini/cudpp_radixsort.h index eee009cedb..643da86517 100644 --- a/lib/gpu/cudpp_mini/cudpp_radixsort.h +++ b/lib/gpu/cudpp_mini/cudpp_radixsort.h @@ -9,7 +9,7 @@ // ------------------------------------------------------------- #ifndef __RADIXSORT_H__ #define __RADIXSORT_H__ - + #define SORT_CTA_SIZE 256 //This CTA_SIZE must equal 16 * number of radices #include 
"cudpp_globals.h" diff --git a/lib/gpu/cudpp_mini/cudpp_scan.h b/lib/gpu/cudpp_mini/cudpp_scan.h index 6b55d80f70..4a5ef348ff 100644 --- a/lib/gpu/cudpp_mini/cudpp_scan.h +++ b/lib/gpu/cudpp_mini/cudpp_scan.h @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt // in the root directory of this source distribution. // ------------------------------------------------------------- - + /** * @file * cudpp_scan.h diff --git a/lib/gpu/cudpp_mini/cudpp_util.h b/lib/gpu/cudpp_mini/cudpp_util.h index 8815b5bf5f..c25a384f29 100644 --- a/lib/gpu/cudpp_mini/cudpp_util.h +++ b/lib/gpu/cudpp_mini/cudpp_util.h @@ -7,7 +7,7 @@ // This source code is distributed under the terms of license.txt in // the root directory of this source distribution. // ------------------------------------------------------------- - + /** * @file * cudpp_util.h diff --git a/lib/gpu/cudpp_mini/cutil.h b/lib/gpu/cudpp_mini/cutil.h index 390b40615c..3df93fb886 100644 --- a/lib/gpu/cudpp_mini/cutil.h +++ b/lib/gpu/cudpp_mini/cutil.h @@ -26,7 +26,7 @@ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. */ - + /* CUda UTility Library */ diff --git a/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu b/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu index ac66b9a9f2..10b4f38b18 100644 --- a/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu +++ b/lib/gpu/cudpp_mini/kernel/radixsort_kernel.cu @@ -1,868 +1,868 @@ -// ------------------------------------------------------------- -// CUDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision$ -// $Date$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt -// in the root directory of this source distribution. -// ------------------------------------------------------------- - -#include "cudpp_radixsort.h" -#include -#include "sharedmem.h" -#include "cta/radixsort_cta.cu" - -#ifdef __DEVICE_EMULATION__ -#define __EMUSYNC __syncthreads() -#else -#define __EMUSYNC -#endif - -/** - * @file - * radixsort_app.cu - * - * @brief CUDPP kernel-level radix sorting routines - */ - -/** \addtogroup cudpp_kernel - * @{ - */ - -/** @name RadixSort Functions - * @{ - */ - - - -typedef unsigned int uint; - -/** @brief And empty kernel used to reset CTA issue hardware - **/ -__global__ void emptyKernel() {} - - -/** @brief Does special binary arithmetic before sorting floats - * - * Uses floatFlip function to flip bits. - * @param[in,out] values Values to be manipulated - * @param[in] numValues Number of values to be flipped - **/ - -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -flipFloats(uint *values, uint numValues) -{ - uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; - if (index < numValues) values[index] = floatFlip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatFlip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatFlip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatFlip(values[index]); -} - -/** @brief Undoes the flips from flipFloats - * - * Uses floatUnflip function to unflip bits. 
- * @param[in,out] values Values to be manipulated - * @param[in] numValues Number of values to be unflipped - **/ -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -unflipFloats(uint *values, uint numValues) -{ - uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; - if (index < numValues) values[index] = floatUnflip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatUnflip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatUnflip(values[index]); - index += blockDim.x; - if (index < numValues) values[index] = floatUnflip(values[index]); -} - - -/** @brief Optimization for sorts of WARP_SIZE or fewer elements - * - * @param[in,out] keys Keys to be sorted. - * @param[in,out] values Associated values to be sorted (through keys). - * @param[in] numElements Number of elements in the sort. - */ -template -__global__ -LAUNCH_BOUNDS(WARP_SIZE) -void radixSortSingleWarp(uint *keys, - uint *values, - uint numElements) -{ - volatile __shared__ uint sKeys[WARP_SIZE]; //remove class distinctions - volatile __shared__ uint sValues[WARP_SIZE]; - volatile __shared__ uint sFlags[WARP_SIZE]; - - sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); - sValues[threadIdx.x] = values[threadIdx.x]; - - __EMUSYNC; // emulation only - - for(uint i = 1; i < numElements; i++) - { - uint key_i = sKeys[i]; - uint val_i = sValues[i]; - - sFlags[threadIdx.x] = 0; - - uint temp, tempval; - if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) - { - temp = sKeys[threadIdx.x]; - tempval = sValues[threadIdx.x]; - sFlags[threadIdx.x] = 1; - -#ifdef __DEVICE_EMULATION__ - } - __EMUSYNC; - if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) - { -#endif - sKeys[threadIdx.x + 1] = temp; - sValues[threadIdx.x + 1] = tempval; - sFlags[threadIdx.x + 1] = 0; - } - - - if(sFlags[threadIdx.x] == 1 ) - { - sKeys[threadIdx.x] = key_i; - sValues[threadIdx.x] = val_i; - } - - __EMUSYNC; // emulation only - - } - keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); - values[threadIdx.x] = sValues[threadIdx.x]; -} - - -/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. - * - * @param[in,out] keys Keys to be sorted - * @param[in] numElements Total number of elements to be sorted -**/ - -template -__global__ -LAUNCH_BOUNDS(WARP_SIZE) -void radixSortSingleWarpKeysOnly(uint *keys, - uint numElements) -{ - volatile __shared__ uint sKeys[WARP_SIZE]; - volatile __shared__ uint sFlags[WARP_SIZE]; - - sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); - - __EMUSYNC; // emulation only - - for(uint i = 1; i < numElements; i++) - { - uint key_i = sKeys[i]; - - sFlags[threadIdx.x] = 0; - - uint temp; - if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) - { - temp = sKeys[threadIdx.x]; - sFlags[threadIdx.x] = 1; -#ifdef __DEVICE_EMULATION__ - } - __EMUSYNC; - if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) - { -#endif - sKeys[threadIdx.x + 1] = temp; - sFlags[threadIdx.x + 1] = 0; - } - if(sFlags[threadIdx.x] == 1 ) - { - sKeys[threadIdx.x] = key_i; - } - - __EMUSYNC; // emulation only - - } - keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); -} - -/** @brief sorts all blocks of data independently in shared memory. -* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements -* -* The radix sort is done in two stages. 
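Taken together, one 4-bit pass of this two-stage pipeline can be modeled serially as follows (an illustrative host-side model; the names are not CUDPP's). radixSortBlocks performs the first stage per block in shared memory, while findRadixOffsets and reorderData realize the counting and scatter globally.

// Serial model of one 4-bit radix pass: count per digit, exclusive-scan
// the counts into starting offsets, then stable-scatter.
void radix_pass_model(const unsigned int *in, unsigned int *out,
                      int n, int startbit)
{
    unsigned int count[16]  = {0};
    unsigned int offset[16] = {0};
    for (int i = 0; i < n; ++i)
        ++count[(in[i] >> startbit) & 0xF];
    for (int r = 1; r < 16; ++r)
        offset[r] = offset[r - 1] + count[r - 1];   // exclusive scan
    for (int i = 0; i < n; ++i)                     // stable scatter
        out[offset[(in[i] >> startbit) & 0xF]++] = in[i];
}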
This stage calls radixSortBlock on each -* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) -* -* Template parameters are used to generate efficient code for various special cases -* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) -* differently than arrays that are not. "flip" is used to only compile in the -* float flip code when float keys are used. "loop" is used when persistent CTAs -* are used. -* -* By persistent CTAs we mean that we launch only as many thread blocks as can -* be resident in the GPU and no more, rather than launching as many threads as -* we have elements. Persistent CTAs loop over blocks of elements until all work -* is complete. This can be faster in some cases. In our tests it is faster -* for large sorts (and the threshold is higher on compute version 1.1 and earlier -* GPUs than it is on compute version 1.2 GPUs. -* -* @param[out] keysOut Output of sorted keys -* @param[out] valuesOut Output of associated values -* @param[in] keysIn Input of unsorted keys in GPU -* @param[in] valuesIn Input of associated input values -* @param[in] numElements Total number of elements to sort -* @param[in] totalBlocks The number of blocks of data to sort -*/ -template -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -radixSortBlocks(uint4* keysOut, uint4* valuesOut, - uint4* keysIn, uint4* valuesIn, - uint numElements, uint totalBlocks) -{ - extern __shared__ uint4 sMem[]; - - uint4 key, value; - - - uint blockId = blockIdx.x; - - while (!loop || blockId < totalBlocks) - { - uint i = blockId * blockDim.x + threadIdx.x; - uint idx = i << 2; - - // handle non-full last block if array is not multiple of 1024 numElements - if (!fullBlocks && idx+3 >= numElements) - { - if (idx >= numElements) - { - key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); - value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); - } - else - { - // for non-full block, we handle uint1 values instead of uint4 - uint *keys1 = (uint*)keysIn; - uint *values1 = (uint*)valuesIn; - - key.x = (idx < numElements) ? floatFlip(keys1[idx]) : UINT_MAX; - key.y = (idx+1 < numElements) ? floatFlip(keys1[idx+1]) : UINT_MAX; - key.z = (idx+2 < numElements) ? floatFlip(keys1[idx+2]) : UINT_MAX; - key.w = UINT_MAX; - - value.x = (idx < numElements) ? values1[idx] : UINT_MAX; - value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX; - value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX; - value.w = UINT_MAX; - } - } - else - { - key = keysIn[i]; - value = valuesIn[i]; - - if (flip) - { - key.x = floatFlip(key.x); - key.y = floatFlip(key.y); - key.z = floatFlip(key.z); - key.w = floatFlip(key.w); - } - } - __syncthreads(); - radixSortBlock(key, value); - - // handle non-full last block if array is not multiple of 1024 numElements - if(!fullBlocks && idx+3 >= numElements) - { - if (idx < numElements) - { - // for non-full block, we handle uint1 values instead of uint4 - uint *keys1 = (uint*)keysOut; - uint *values1 = (uint*)valuesOut; - - keys1[idx] = key.x; - values1[idx] = value.x; - - if (idx + 1 < numElements) - { - keys1[idx + 1] = key.y; - values1[idx + 1] = value.y; - - if (idx + 2 < numElements) - { - keys1[idx + 2] = key.z; - values1[idx + 2] = value.z; - } - } - } - } - else - { - keysOut[i] = key; - valuesOut[i] = value; - } - - if (loop) - blockId += gridDim.x; - else - break; - } -} - -/** @brief Computes the number of keys of each radix in each block stores offset. 
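Per block, that computation reduces serially to the following (an illustrative model; the kernel instead finds the run boundaries in parallel by comparing each digit with its left neighbor, as described next).

// In a block already sorted by the current 4-bit digit, a radix's start
// offset is the index where the digit first appears, and its count is the
// run length.
void block_offsets_model(const unsigned int *keys, int n, int startbit,
                         unsigned int starts[16], unsigned int counts[16])
{
    for (int r = 0; r < 16; ++r)
    {
        starts[r] = 0;
        counts[r] = 0;
    }
    for (int i = 0; i < n; ++i)
    {
        unsigned int d = (keys[i] >> startbit) & 0xF;
        if (counts[d]++ == 0)
            starts[d] = i;   // first occurrence of this digit
    }
}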
-* -* Given an array with blocks sorted according to a 4-bit radix group, each -* block counts the number of keys that fall into each radix in the group, and -* finds the starting offset of each radix in the block. It then writes the radix -* counts to the counters array, and the starting offsets to the blockOffsets array. -* -* Template parameters are used to generate efficient code for various special cases -* For example, we have to handle arrays that are a multiple of the block size -* (fullBlocks) differently than arrays that are not. "loop" is used when persistent -* CTAs are used. -* -* By persistent CTAs we mean that we launch only as many thread blocks as can -* be resident in the GPU and no more, rather than launching as many threads as -* we have elements. Persistent CTAs loop over blocks of elements until all work -* is complete. This can be faster in some cases. In our tests it is faster -* for large sorts (and the threshold is higher on compute version 1.1 and earlier -* GPUs than it is on compute version 1.2 GPUs. -* -* @param[in] keys Input keys -* @param[out] counters Radix count for each block -* @param[out] blockOffsets The offset address for each block -* @param[in] numElements Total number of elements -* @param[in] totalBlocks Total number of blocks -**/ -template -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -findRadixOffsets(uint2 *keys, - uint *counters, - uint *blockOffsets, - uint numElements, - uint totalBlocks) -{ - extern __shared__ uint sRadix1[]; - __shared__ uint sStartPointers[16]; - - uint blockId = blockIdx.x; - - while (!loop || blockId < totalBlocks) - { - uint2 radix2; - - uint i = blockId * blockDim.x + threadIdx.x; - - // handle non-full last block if array is not multiple of 1024 numElements - if(!fullBlocks && ((i + 1) << 1 ) > numElements ) - { - // handle uint1 rather than uint2 for non-full blocks - uint *keys1 = (uint*)keys; - uint j = i << 1; - - radix2.x = (j < numElements) ? keys1[j] : UINT_MAX; - j++; - radix2.y = (j < numElements) ? keys1[j] : UINT_MAX; - } - else - { - radix2 = keys[i]; - } - - sRadix1[2 * threadIdx.x] = (radix2.x >> startbit) & 0xF; - sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF; - - // Finds the position where the sRadix1 entries differ and stores start - // index for each radix. - if(threadIdx.x < 16) - { - sStartPointers[threadIdx.x] = 0; - } - __syncthreads(); - - if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) - { - sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x; - } - if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]) - { - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE; - } - __syncthreads(); - - if(threadIdx.x < 16) - { - blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x]; - } - __syncthreads(); - - // Compute the sizes of each block. 
- if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) - { - sStartPointers[sRadix1[threadIdx.x - 1]] = - threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]]; - } - if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] ) - { - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] = - threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]]; - } - - - if(threadIdx.x == SORT_CTA_SIZE - 1) - { - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] = - 2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]]; - } - __syncthreads(); - - if(threadIdx.x < 16) - { - counters[threadIdx.x * totalBlocks + blockId] = - sStartPointers[threadIdx.x]; - } - - if (loop) - blockId += gridDim.x; - else - break; - } -} - - -/**@brief Reorders data in the global array. -* -* reorderData shuffles data in the array globally after the radix -* offsets have been found. On compute version 1.1 and earlier GPUs, this code depends -* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). -* -* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures -* that all writes are coalesced using extra work in the kernel. On later -* GPUs coalescing rules have been relaxed, so this extra overhead hurts -* performance. On these GPUs we set manualCoalesce=false and directly store -* the results. -* -* Template parameters are used to generate efficient code for various special cases -* For example, we have to handle arrays that are a multiple of the block size -* (fullBlocks) differently than arrays that are not. "loop" is used when persistent -* CTAs are used. -* -* By persistent CTAs we mean that we launch only as many thread blocks as can -* be resident in the GPU and no more, rather than launching as many threads as -* we have elements. Persistent CTAs loop over blocks of elements until all work -* is complete. This can be faster in some cases. In our tests it is faster -* for large sorts (and the threshold is higher on compute version 1.1 and earlier -* GPUs than it is on compute version 1.2 GPUs. -* -* @param[out] outKeys Output of sorted keys -* @param[out] outValues Output of associated values -* @param[in] keys Input of unsorted keys in GPU -* @param[in] values Input of associated input values -* @param[in] blockOffsets The offset address for each block -* @param[in] offsets Address of each radix within each block -* @param[in] sizes Number of elements in a block -* @param[in] numElements Total number of elements -* @param[in] totalBlocks Total number of data blocks to process -* -* @todo Args that are const below should be prototyped as const -**/ -template -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -reorderData(uint *outKeys, - uint *outValues, - uint2 *keys, - uint2 *values, - uint *blockOffsets, - uint *offsets, - uint *sizes, - uint numElements, - uint totalBlocks) -{ - __shared__ uint2 sKeys2[SORT_CTA_SIZE]; - __shared__ uint2 sValues2[SORT_CTA_SIZE]; - __shared__ uint sOffsets[16]; - __shared__ uint sBlockOffsets[16]; - - uint *sKeys1 = (uint*)sKeys2; - uint *sValues1 = (uint*)sValues2; - - uint blockId = blockIdx.x; - - while (!loop || blockId < totalBlocks) - { - uint i = blockId * blockDim.x + threadIdx.x; - - // handle non-full last block if array is not multiple of 1024 numElements - if(!fullBlocks && (((i + 1) << 1) > numElements)) - { - uint *keys1 = (uint*)keys; - uint *values1 = (uint*)values; - uint j = i << 1; - - sKeys1[threadIdx.x << 1] = (j < numElements) ? 
keys1[j] : UINT_MAX; - sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX; - j++; - sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX; - sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? values1[j] : UINT_MAX; - } - else - { - sKeys2[threadIdx.x] = keys[i]; - sValues2[threadIdx.x] = values[i]; - } - - if (!manualCoalesce) - { - if(threadIdx.x < 16) - { - sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; - sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; - } - __syncthreads(); - - uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF; - uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix]; - - if (fullBlocks || globalOffset < numElements) - { - outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x]); - outValues[globalOffset] = sValues1[threadIdx.x]; - } - - radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF; - globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix]; - - if (fullBlocks || globalOffset < numElements) - { - outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x + SORT_CTA_SIZE]); - outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE]; - } - } - else - { - __shared__ uint sSizes[16]; - - if(threadIdx.x < 16) - { - sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; - sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; - sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId]; - } - __syncthreads(); - - // 1 half-warp is responsible for writing out all values for 1 radix. - // Loops if there are more than 16 values to be written out. - // All start indices are rounded down to the nearest multiple of 16, and - // all end indices are rounded up to the nearest multiple of 16. - // Thus it can do extra work if the start and end indices are not multiples of 16 - // This is bounded by a factor of 2 (it can do 2X more work at most). - - const uint halfWarpID = threadIdx.x >> 4; - - const uint halfWarpOffset = threadIdx.x & 0xF; - const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; - - uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; - uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - - ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); - uint numIterations = endPos - startPos; - - uint outOffset = startPos + halfWarpOffset; - uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; - - for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) - { - if( (outOffset >= sOffsets[halfWarpID]) && - (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) - { - if(blockId < totalBlocks - 1 || outOffset < numElements) - { - outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); - outValues[outOffset] = sValues1[inOffset]; - } - } - } - } - - if (loop) - { - blockId += gridDim.x; - __syncthreads(); - } - else - break; - } -} - -/** @brief Sorts all blocks of data independently in shared memory. -* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements -* -* The radix sort is done in two stages. This stage calls radixSortBlock on each -* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) -* -* Template parameters are used to generate efficient code for various special cases -* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) -* differently than arrays that are not. "flip" is used to only compile in the -* float flip code when float keys are used. 
"loop" is used when persistent CTAs -* are used. -* -* By persistent CTAs we mean that we launch only as many thread blocks as can -* be resident in the GPU and no more, rather than launching as many threads as -* we have elements. Persistent CTAs loop over blocks of elements until all work -* is complete. This can be faster in some cases. In our tests it is faster -* for large sorts (and the threshold is higher on compute version 1.1 and earlier -* GPUs than it is on compute version 1.2 GPUs. -* -* @param[out] keysOut Output of sorted keys GPU main memory -* @param[in] keysIn Input of unsorted keys in GPU main memory -* @param[in] numElements Total number of elements to sort -* @param[in] totalBlocks Total number of blocks to sort -* -*/ -template -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks) -{ - extern __shared__ uint4 sMem[]; - - uint4 key; - - uint blockId = blockIdx.x; - - while (!loop || blockId < totalBlocks) - { - uint i = blockId * blockDim.x + threadIdx.x; - uint idx = i << 2; - - // handle non-full last block if array is not multiple of 1024 numElements - if (!fullBlocks && idx+3 >= numElements) - { - if (idx >= numElements) - { - key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); - } - else - { - // for non-full block, we handle uint1 values instead of uint4 - uint *keys1 = (uint*)keysIn; - - key.x = (idx < numElements) ? floatFlip(keys1[idx]) : UINT_MAX; - key.y = (idx+1 < numElements) ? floatFlip(keys1[idx+1]) : UINT_MAX; - key.z = (idx+2 < numElements) ? floatFlip(keys1[idx+2]) : UINT_MAX; - key.w = UINT_MAX; - } - } - else - { - key = keysIn[i]; - if (flip) - { - key.x = floatFlip(key.x); - key.y = floatFlip(key.y); - key.z = floatFlip(key.z); - key.w = floatFlip(key.w); - } - } - __syncthreads(); - radixSortBlockKeysOnly(key); - - // handle non-full last block if array is not multiple of 1024 numElements - if(!fullBlocks && idx+3 >= numElements) - { - if (idx < numElements) - { - // for non-full block, we handle uint1 values instead of uint4 - uint *keys1 = (uint*)keysOut; - - keys1[idx] = key.x; - - if (idx + 1 < numElements) - { - keys1[idx + 1] = key.y; - - if (idx + 2 < numElements) - { - keys1[idx + 2] = key.z; - } - } - } - } - else - { - keysOut[i] = key; - } - - if (loop) - blockId += gridDim.x; - else - break; - } -} - -/** @brief Reorders data in the global array. -* -* reorderDataKeysOnly shuffles data in the array globally after the radix offsets -* have been found. On compute version 1.1 and earlier GPUs, this code depends -* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). -* -* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures -* that all writes are coalesced using extra work in the kernel. On later -* GPUs coalescing rules have been relaxed, so this extra overhead hurts -* performance. On these GPUs we set manualCoalesce=false and directly store -* the results. -* -* Template parameters are used to generate efficient code for various special cases -* For example, we have to handle arrays that are a multiple of the block size -* (fullBlocks) differently than arrays that are not. "loop" is used when persistent -* CTAs are used. -* -* By persistent CTAs we mean that we launch only as many thread blocks as can -* be resident in the GPU and no more, rather than launching as many threads as -* we have elements. Persistent CTAs loop over blocks of elements until all work -* is complete. This can be faster in some cases. 
In our tests it is faster -* for large sorts (and the threshold is higher on compute version 1.1 and earlier -* GPUs than it is on compute version 1.2 GPUs. -* -* @param[out] outKeys Output result of reorderDataKeysOnly() -* @param[in] keys Keys to be reordered -* @param[in] blockOffsets Start offset for each block -* @param[in] offsets Offset of each radix within each block -* @param[in] sizes Number of elements in a block -* @param[in] numElements Total number of elements -* @param[in] totalBlocks Total number of blocks -*/ -template -__global__ void -LAUNCH_BOUNDS(SORT_CTA_SIZE) -reorderDataKeysOnly(uint *outKeys, - uint2 *keys, - uint *blockOffsets, - uint *offsets, - uint *sizes, - uint numElements, - uint totalBlocks) -{ - __shared__ uint2 sKeys2[SORT_CTA_SIZE]; - __shared__ uint sOffsets[16]; - __shared__ uint sBlockOffsets[16]; - - uint *sKeys1 = (uint*)sKeys2; - - uint blockId = blockIdx.x; - - while (!loop || blockId < totalBlocks) - { - uint i = blockId * blockDim.x + threadIdx.x; - - // handle non-full last block if array is not multiple of 1024 numElements - if(!fullBlocks && (((i + 1) << 1) > numElements)) - { - uint *keys1 = (uint*)keys; - uint j = i << 1; - - sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX; - j++; - sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX; - } - else - { - sKeys2[threadIdx.x] = keys[i]; - } - - if (!manualCoalesce) - { - if(threadIdx.x < 16) - { - sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; - sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; - } - __syncthreads(); - - uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF; - uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix]; - - if (fullBlocks || globalOffset < numElements) - { - outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x]); - } - - radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF; - globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix]; - - if (fullBlocks || globalOffset < numElements) - { - outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x + SORT_CTA_SIZE]); - } - } - else - { - __shared__ uint sSizes[16]; - - if(threadIdx.x < 16) - { - sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; - sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; - sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId]; - } - __syncthreads(); - - // 1 half-warp is responsible for writing out all values for 1 radix. - // Loops if there are more than 16 values to be written out. - // All start indices are rounded down to the nearest multiple of 16, and - // all end indices are rounded up to the nearest multiple of 16. - // Thus it can do extra work if the start and end indices are not multiples of 16 - // This is bounded by a factor of 2 (it can do 2X more work at most). 
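The rounding arithmetic this comment describes, isolated as a sketch (assumes a non-empty radix, size >= 1; the kernel's inner bounds checks mask everything else off):

// startPos rounds the radix's global start down to a multiple of 16 and
// endPos rounds one-past-its-end up, so every half-warp issues aligned
// 16-element writes.
__host__ __device__ void aligned_span(unsigned int offset, unsigned int size,
                                      unsigned int *startPos,
                                      unsigned int *endPos)
{
    unsigned int end = offset + size;            // one past the last element
    *startPos = offset & 0xFFFFFFF0u;            // round down to 16
    *endPos   = end + 15 - ((end - 1) & 0xF);    // round up to 16
}

For example, offset = 37 and size = 5 give startPos = 32 and endPos = 48: one full 16-element iteration with the out-of-range slots masked, which is the at-most-2x overwork bounded above.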
- - const uint halfWarpID = threadIdx.x >> 4; - - const uint halfWarpOffset = threadIdx.x & 0xF; - const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; - - uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; - uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - - ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); - uint numIterations = endPos - startPos; - - uint outOffset = startPos + halfWarpOffset; - uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; - - for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) - { - if( (outOffset >= sOffsets[halfWarpID]) && - (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) - { - if(blockId < totalBlocks - 1 || outOffset < numElements) - { - outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); - } - } - } - } - - if (loop) - { - blockId += gridDim.x; - __syncthreads(); - } - else - break; - } -} - -/** @} */ // end radixsort functions -/** @} */ // end cudpp_kernel +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. +// ------------------------------------------------------------- + +#include "cudpp_radixsort.h" +#include +#include "sharedmem.h" +#include "cta/radixsort_cta.cu" + +#ifdef __DEVICE_EMULATION__ +#define __EMUSYNC __syncthreads() +#else +#define __EMUSYNC +#endif + +/** + * @file + * radixsort_app.cu + * + * @brief CUDPP kernel-level radix sorting routines + */ + +/** \addtogroup cudpp_kernel + * @{ + */ + +/** @name RadixSort Functions + * @{ + */ + + + +typedef unsigned int uint; + +/** @brief And empty kernel used to reset CTA issue hardware + **/ +__global__ void emptyKernel() {} + + +/** @brief Does special binary arithmetic before sorting floats + * + * Uses floatFlip function to flip bits. + * @param[in,out] values Values to be manipulated + * @param[in] numValues Number of values to be flipped + **/ + +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +flipFloats(uint *values, uint numValues) +{ + uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatFlip(values[index]); +} + +/** @brief Undoes the flips from flipFloats + * + * Uses floatUnflip function to unflip bits. + * @param[in,out] values Values to be manipulated + * @param[in] numValues Number of values to be unflipped + **/ +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +unflipFloats(uint *values, uint numValues) +{ + uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); + index += blockDim.x; + if (index < numValues) values[index] = floatUnflip(values[index]); +} + + +/** @brief Optimization for sorts of WARP_SIZE or fewer elements + * + * @param[in,out] keys Keys to be sorted. 
+ * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] numElements Number of elements in the sort. + */ +template +__global__ +LAUNCH_BOUNDS(WARP_SIZE) +void radixSortSingleWarp(uint *keys, + uint *values, + uint numElements) +{ + volatile __shared__ uint sKeys[WARP_SIZE]; //remove class distinctions + volatile __shared__ uint sValues[WARP_SIZE]; + volatile __shared__ uint sFlags[WARP_SIZE]; + + sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); + sValues[threadIdx.x] = values[threadIdx.x]; + + __EMUSYNC; // emulation only + + for(uint i = 1; i < numElements; i++) + { + uint key_i = sKeys[i]; + uint val_i = sValues[i]; + + sFlags[threadIdx.x] = 0; + + uint temp, tempval; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { + temp = sKeys[threadIdx.x]; + tempval = sValues[threadIdx.x]; + sFlags[threadIdx.x] = 1; + +#ifdef __DEVICE_EMULATION__ + } + __EMUSYNC; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { +#endif + sKeys[threadIdx.x + 1] = temp; + sValues[threadIdx.x + 1] = tempval; + sFlags[threadIdx.x + 1] = 0; + } + + + if(sFlags[threadIdx.x] == 1 ) + { + sKeys[threadIdx.x] = key_i; + sValues[threadIdx.x] = val_i; + } + + __EMUSYNC; // emulation only + + } + keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); + values[threadIdx.x] = sValues[threadIdx.x]; +} + + +/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version. + * + * @param[in,out] keys Keys to be sorted + * @param[in] numElements Total number of elements to be sorted +**/ + +template +__global__ +LAUNCH_BOUNDS(WARP_SIZE) +void radixSortSingleWarpKeysOnly(uint *keys, + uint numElements) +{ + volatile __shared__ uint sKeys[WARP_SIZE]; + volatile __shared__ uint sFlags[WARP_SIZE]; + + sKeys[threadIdx.x] = floatFlip(keys[threadIdx.x]); + + __EMUSYNC; // emulation only + + for(uint i = 1; i < numElements; i++) + { + uint key_i = sKeys[i]; + + sFlags[threadIdx.x] = 0; + + uint temp; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { + temp = sKeys[threadIdx.x]; + sFlags[threadIdx.x] = 1; +#ifdef __DEVICE_EMULATION__ + } + __EMUSYNC; + if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) ) + { +#endif + sKeys[threadIdx.x + 1] = temp; + sFlags[threadIdx.x + 1] = 0; + } + if(sFlags[threadIdx.x] == 1 ) + { + sKeys[threadIdx.x] = key_i; + } + + __EMUSYNC; // emulation only + + } + keys[threadIdx.x] = floatUnflip(sKeys[threadIdx.x]); +} + +/** @brief sorts all blocks of data independently in shared memory. +* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements +* +* The radix sort is done in two stages. This stage calls radixSortBlock on each +* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) +* differently than arrays that are not. "flip" is used to only compile in the +* float flip code when float keys are used. "loop" is used when persistent CTAs +* are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. 
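The persistent-CTA pattern itself reduces to this skeleton (illustrative only; the body stands in for one block's worth of real work, and the grid is sized to what the device can keep resident):

__global__ void persistent_cta_skeleton(const unsigned int *in,
                                        unsigned int *out,
                                        unsigned int totalBlocks)
{
    // Each resident CTA strides through the logical blocks of work rather
    // than mapping one-to-one, exactly like the while(!loop || ...) /
    // blockId += gridDim.x idiom in the kernels below.
    for (unsigned int blockId = blockIdx.x; blockId < totalBlocks;
         blockId += gridDim.x)
    {
        unsigned int i = blockId * blockDim.x + threadIdx.x;
        out[i] = in[i];   // stand-in for one block of real work
    }
}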
+* +* @param[out] keysOut Output of sorted keys +* @param[out] valuesOut Output of associated values +* @param[in] keysIn Input of unsorted keys in GPU +* @param[in] valuesIn Input of associated input values +* @param[in] numElements Total number of elements to sort +* @param[in] totalBlocks The number of blocks of data to sort +*/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +radixSortBlocks(uint4* keysOut, uint4* valuesOut, + uint4* keysIn, uint4* valuesIn, + uint numElements, uint totalBlocks) +{ + extern __shared__ uint4 sMem[]; + + uint4 key, value; + + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + uint idx = i << 2; + + // handle non-full last block if array is not multiple of 1024 numElements + if (!fullBlocks && idx+3 >= numElements) + { + if (idx >= numElements) + { + key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX); + } + else + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysIn; + uint *values1 = (uint*)valuesIn; + + key.x = (idx < numElements) ? floatFlip(keys1[idx]) : UINT_MAX; + key.y = (idx+1 < numElements) ? floatFlip(keys1[idx+1]) : UINT_MAX; + key.z = (idx+2 < numElements) ? floatFlip(keys1[idx+2]) : UINT_MAX; + key.w = UINT_MAX; + + value.x = (idx < numElements) ? values1[idx] : UINT_MAX; + value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX; + value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX; + value.w = UINT_MAX; + } + } + else + { + key = keysIn[i]; + value = valuesIn[i]; + + if (flip) + { + key.x = floatFlip(key.x); + key.y = floatFlip(key.y); + key.z = floatFlip(key.z); + key.w = floatFlip(key.w); + } + } + __syncthreads(); + radixSortBlock(key, value); + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && idx+3 >= numElements) + { + if (idx < numElements) + { + // for non-full block, we handle uint1 values instead of uint4 + uint *keys1 = (uint*)keysOut; + uint *values1 = (uint*)valuesOut; + + keys1[idx] = key.x; + values1[idx] = value.x; + + if (idx + 1 < numElements) + { + keys1[idx + 1] = key.y; + values1[idx + 1] = value.y; + + if (idx + 2 < numElements) + { + keys1[idx + 2] = key.z; + values1[idx + 2] = value.z; + } + } + } + } + else + { + keysOut[i] = key; + valuesOut[i] = value; + } + + if (loop) + blockId += gridDim.x; + else + break; + } +} + +/** @brief Computes the number of keys of each radix in each block stores offset. +* +* Given an array with blocks sorted according to a 4-bit radix group, each +* block counts the number of keys that fall into each radix in the group, and +* finds the starting offset of each radix in the block. It then writes the radix +* counts to the counters array, and the starting offsets to the blockOffsets array. +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size +* (fullBlocks) differently than arrays that are not. "loop" is used when persistent +* CTAs are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. 
In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. +* +* @param[in] keys Input keys +* @param[out] counters Radix count for each block +* @param[out] blockOffsets The offset address for each block +* @param[in] numElements Total number of elements +* @param[in] totalBlocks Total number of blocks +**/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +findRadixOffsets(uint2 *keys, + uint *counters, + uint *blockOffsets, + uint numElements, + uint totalBlocks) +{ + extern __shared__ uint sRadix1[]; + __shared__ uint sStartPointers[16]; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint2 radix2; + + uint i = blockId * blockDim.x + threadIdx.x; + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && ((i + 1) << 1 ) > numElements ) + { + // handle uint1 rather than uint2 for non-full blocks + uint *keys1 = (uint*)keys; + uint j = i << 1; + + radix2.x = (j < numElements) ? keys1[j] : UINT_MAX; + j++; + radix2.y = (j < numElements) ? keys1[j] : UINT_MAX; + } + else + { + radix2 = keys[i]; + } + + sRadix1[2 * threadIdx.x] = (radix2.x >> startbit) & 0xF; + sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF; + + // Finds the position where the sRadix1 entries differ and stores start + // index for each radix. + if(threadIdx.x < 16) + { + sStartPointers[threadIdx.x] = 0; + } + __syncthreads(); + + if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) + { + sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x; + } + if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]) + { + sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE; + } + __syncthreads(); + + if(threadIdx.x < 16) + { + blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x]; + } + __syncthreads(); + + // Compute the sizes of each block. + if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) ) + { + sStartPointers[sRadix1[threadIdx.x - 1]] = + threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]]; + } + if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] ) + { + sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] = + threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]]; + } + + + if(threadIdx.x == SORT_CTA_SIZE - 1) + { + sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] = + 2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]]; + } + __syncthreads(); + + if(threadIdx.x < 16) + { + counters[threadIdx.x * totalBlocks + blockId] = + sStartPointers[threadIdx.x]; + } + + if (loop) + blockId += gridDim.x; + else + break; + } +} + + +/**@brief Reorders data in the global array. +* +* reorderData shuffles data in the array globally after the radix +* offsets have been found. On compute version 1.1 and earlier GPUs, this code depends +* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits). +* +* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures +* that all writes are coalesced using extra work in the kernel. On later +* GPUs coalescing rules have been relaxed, so this extra overhead hurts +* performance. On these GPUs we set manualCoalesce=false and directly store +* the results. 
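In that direct-store path, a key's final position follows from its digit's global offset plus the key's rank within its block-local run; as a sketch (names illustrative, mirroring the globalOffset computation in the kernel below):

// d is the key's current 4-bit digit; globalOffset16 holds the digit's
// scanned global offsets and blockStart16 the digit's start offset inside
// this block, so tid - blockStart16[d] is the key's rank within its run.
__device__ unsigned int scatter_pos(unsigned int key, unsigned int tid,
                                    int startbit,
                                    const unsigned int *globalOffset16,
                                    const unsigned int *blockStart16)
{
    unsigned int d = (key >> startbit) & 0xF;
    return globalOffset16[d] + tid - blockStart16[d];
}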
+* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size +* (fullBlocks) differently than arrays that are not. "loop" is used when persistent +* CTAs are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. Persistent CTAs loop over blocks of elements until all work +* is complete. This can be faster in some cases. In our tests it is faster +* for large sorts (and the threshold is higher on compute version 1.1 and earlier +* GPUs than it is on compute version 1.2 GPUs. +* +* @param[out] outKeys Output of sorted keys +* @param[out] outValues Output of associated values +* @param[in] keys Input of unsorted keys in GPU +* @param[in] values Input of associated input values +* @param[in] blockOffsets The offset address for each block +* @param[in] offsets Address of each radix within each block +* @param[in] sizes Number of elements in a block +* @param[in] numElements Total number of elements +* @param[in] totalBlocks Total number of data blocks to process +* +* @todo Args that are const below should be prototyped as const +**/ +template +__global__ void +LAUNCH_BOUNDS(SORT_CTA_SIZE) +reorderData(uint *outKeys, + uint *outValues, + uint2 *keys, + uint2 *values, + uint *blockOffsets, + uint *offsets, + uint *sizes, + uint numElements, + uint totalBlocks) +{ + __shared__ uint2 sKeys2[SORT_CTA_SIZE]; + __shared__ uint2 sValues2[SORT_CTA_SIZE]; + __shared__ uint sOffsets[16]; + __shared__ uint sBlockOffsets[16]; + + uint *sKeys1 = (uint*)sKeys2; + uint *sValues1 = (uint*)sValues2; + + uint blockId = blockIdx.x; + + while (!loop || blockId < totalBlocks) + { + uint i = blockId * blockDim.x + threadIdx.x; + + // handle non-full last block if array is not multiple of 1024 numElements + if(!fullBlocks && (((i + 1) << 1) > numElements)) + { + uint *keys1 = (uint*)keys; + uint *values1 = (uint*)values; + uint j = i << 1; + + sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX; + sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX; + j++; + sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX; + sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? 
values1[j] : UINT_MAX; + } + else + { + sKeys2[threadIdx.x] = keys[i]; + sValues2[threadIdx.x] = values[i]; + } + + if (!manualCoalesce) + { + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + } + __syncthreads(); + + uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF; + uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x]); + outValues[globalOffset] = sValues1[threadIdx.x]; + } + + radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF; + globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix]; + + if (fullBlocks || globalOffset < numElements) + { + outKeys[globalOffset] = floatUnflip(sKeys1[threadIdx.x + SORT_CTA_SIZE]); + outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE]; + } + } + else + { + __shared__ uint sSizes[16]; + + if(threadIdx.x < 16) + { + sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId]; + sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x]; + sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId]; + } + __syncthreads(); + + // 1 half-warp is responsible for writing out all values for 1 radix. + // Loops if there are more than 16 values to be written out. + // All start indices are rounded down to the nearest multiple of 16, and + // all end indices are rounded up to the nearest multiple of 16. + // Thus it can do extra work if the start and end indices are not multiples of 16 + // This is bounded by a factor of 2 (it can do 2X more work at most). + + const uint halfWarpID = threadIdx.x >> 4; + + const uint halfWarpOffset = threadIdx.x & 0xF; + const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; + + uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; + uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - + ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); + uint numIterations = endPos - startPos; + + uint outOffset = startPos + halfWarpOffset; + uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; + + for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) + { + if( (outOffset >= sOffsets[halfWarpID]) && + (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) + { + if(blockId < totalBlocks - 1 || outOffset < numElements) + { + outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); + outValues[outOffset] = sValues1[inOffset]; + } + } + } + } + + if (loop) + { + blockId += gridDim.x; + __syncthreads(); + } + else + break; + } +} + +/** @brief Sorts all blocks of data independently in shared memory. +* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements +* +* The radix sort is done in two stages. This stage calls radixSortBlock on each +* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits) +* +* Template parameters are used to generate efficient code for various special cases +* For example, we have to handle arrays that are a multiple of the block size (fullBlocks) +* differently than arrays that are not. "flip" is used to only compile in the +* float flip code when float keys are used. "loop" is used when persistent CTAs +* are used. +* +* By persistent CTAs we mean that we launch only as many thread blocks as can +* be resident in the GPU and no more, rather than launching as many threads as +* we have elements. 
+* Persistent CTAs loop over blocks of elements until all work
+* is complete. This can be faster in some cases. In our tests it is faster
+* for large sorts (and the threshold is higher on compute version 1.1 and earlier
+* GPUs than it is on compute version 1.2 GPUs).
+*
+* @param[out] keysOut Output of sorted keys GPU main memory
+* @param[in] keysIn Input of unsorted keys in GPU main memory
+* @param[in] numElements Total number of elements to sort
+* @param[in] totalBlocks Total number of blocks to sort
+*
+*/
+template <uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
+__global__ void
+LAUNCH_BOUNDS(SORT_CTA_SIZE)
+radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks)
+{
+    extern __shared__ uint4 sMem[];
+
+    uint4 key;
+
+    uint blockId = blockIdx.x;
+
+    while (!loop || blockId < totalBlocks)
+    {
+        uint i = blockId * blockDim.x + threadIdx.x;
+        uint idx = i << 2;
+
+        // handle non-full last block if array is not multiple of 1024 numElements
+        if (!fullBlocks && idx+3 >= numElements)
+        {
+            if (idx >= numElements)
+            {
+                key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+            }
+            else
+            {
+                // for non-full block, we handle uint1 values instead of uint4
+                uint *keys1 = (uint*)keysIn;
+
+                key.x = (idx < numElements) ? floatFlip<flip>(keys1[idx]) : UINT_MAX;
+                key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
+                key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
+                key.w = UINT_MAX;
+            }
+        }
+        else
+        {
+            key = keysIn[i];
+            if (flip)
+            {
+                key.x = floatFlip<flip>(key.x);
+                key.y = floatFlip<flip>(key.y);
+                key.z = floatFlip<flip>(key.z);
+                key.w = floatFlip<flip>(key.w);
+            }
+        }
+        __syncthreads();
+        radixSortBlockKeysOnly<nbits, startbit>(key);
+
+        // handle non-full last block if array is not multiple of 1024 numElements
+        if (!fullBlocks && idx+3 >= numElements)
+        {
+            if (idx < numElements)
+            {
+                // for non-full block, we handle uint1 values instead of uint4
+                uint *keys1 = (uint*)keysOut;
+
+                keys1[idx] = key.x;
+
+                if (idx + 1 < numElements)
+                {
+                    keys1[idx + 1] = key.y;
+
+                    if (idx + 2 < numElements)
+                    {
+                        keys1[idx + 2] = key.z;
+                    }
+                }
+            }
+        }
+        else
+        {
+            keysOut[i] = key;
+        }
+
+        if (loop)
+            blockId += gridDim.x;
+        else
+            break;
+    }
+}
+
+/** @brief Reorders data in the global array.
+*
+* reorderDataKeysOnly shuffles data in the array globally after the radix offsets
+* have been found. On compute version 1.1 and earlier GPUs, this code depends
+* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
+*
+* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
+* that all writes are coalesced using extra work in the kernel. On later
+* GPUs coalescing rules have been relaxed, so this extra overhead hurts
+* performance. On these GPUs we set manualCoalesce=false and directly store
+* the results.
+*
+* Template parameters are used to generate efficient code for various special cases
+* For example, we have to handle arrays that are a multiple of the block size
+* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
+* CTAs are used.
+*
+* By persistent CTAs we mean that we launch only as many thread blocks as can
+* be resident in the GPU and no more, rather than launching as many threads as
+* we have elements. Persistent CTAs loop over blocks of elements until all work
+* is complete. This can be faster in some cases. In our tests it is faster
+* for large sorts (and the threshold is higher on compute version 1.1 and earlier
+* GPUs than it is on compute version 1.2 GPUs).
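*
* For reference, the float "flip"/"unflip" used by these kernels is the
* standard monotone bit trick that maps IEEE-754 float bit patterns to
* unsigned integers with the same ordering, so floats (including negatives)
* can go through the unsigned radix sort. A minimal sketch under hypothetical
* names (the kernels actually call CUDPP's own templated floatFlip/floatUnflip
* helpers, defined elsewhere):
*
*   __device__ uint flipFloat(uint f)      // negatives: flip all bits;
*   { return f ^ (-int(f >> 31) | 0x80000000); }   // positives: set sign bit
*
*   __device__ uint unflipFloat(uint f)    // exact inverse of flipFloat
*   { return f ^ (((f >> 31) - 1) | 0x80000000); }
*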
+*
+* @param[out] outKeys Output result of reorderDataKeysOnly()
+* @param[in] keys Keys to be reordered
+* @param[in] blockOffsets Start offset for each block
+* @param[in] offsets Offset of each radix within each block
+* @param[in] sizes Number of elements in a block
+* @param[in] numElements Total number of elements
+* @param[in] totalBlocks Total number of blocks
+*/
+template <uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
+__global__ void
+LAUNCH_BOUNDS(SORT_CTA_SIZE)
+reorderDataKeysOnly(uint *outKeys,
+                    uint2 *keys,
+                    uint *blockOffsets,
+                    uint *offsets,
+                    uint *sizes,
+                    uint numElements,
+                    uint totalBlocks)
+{
+    __shared__ uint2 sKeys2[SORT_CTA_SIZE];
+    __shared__ uint sOffsets[16];
+    __shared__ uint sBlockOffsets[16];
+
+    uint *sKeys1 = (uint*)sKeys2;
+
+    uint blockId = blockIdx.x;
+
+    while (!loop || blockId < totalBlocks)
+    {
+        uint i = blockId * blockDim.x + threadIdx.x;
+
+        // handle non-full last block if array is not multiple of 1024 numElements
+        if (!fullBlocks && (((i + 1) << 1) > numElements))
+        {
+            uint *keys1 = (uint*)keys;
+            uint j = i << 1;
+
+            sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
+            j++;
+            sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
+        }
+        else
+        {
+            sKeys2[threadIdx.x] = keys[i];
+        }
+
+        if (!manualCoalesce)
+        {
+            if (threadIdx.x < 16)
+            {
+                sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
+                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
+            }
+            __syncthreads();
+
+            uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
+            uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];
+
+            if (fullBlocks || globalOffset < numElements)
+            {
+                outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
+            }
+
+            radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
+            globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];
+
+            if (fullBlocks || globalOffset < numElements)
+            {
+                outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
+            }
+        }
+        else
+        {
+            __shared__ uint sSizes[16];
+
+            if (threadIdx.x < 16)
+            {
+                sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
+                sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
+                sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId];
+            }
+            __syncthreads();
+
+            // 1 half-warp is responsible for writing out all values for 1 radix.
+            // Loops if there are more than 16 values to be written out.
+            // All start indices are rounded down to the nearest multiple of 16, and
+            // all end indices are rounded up to the nearest multiple of 16.
+            // Thus it can do extra work if the start and end indices are not multiples of 16.
+            // This is bounded by a factor of 2 (it can do 2X more work at most).
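// Worked example of the rounding described above (illustrative numbers,
// not taken from the source): suppose a radix has sOffsets = 37 and
// sSizes = 20. Then
//   startPos = 37 & 0xFFFFFFF0                         = 32
//   endPos   = (37 + 20) + 15 - ((37 + 20 - 1) & 0xF)  = 57 + 15 - 8 = 64
// so the half-warp sweeps [32, 64) in two 16-element passes, touching 32
// output slots to emit 20 values, within the stated 2X bound.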
+ + const uint halfWarpID = threadIdx.x >> 4; + + const uint halfWarpOffset = threadIdx.x & 0xF; + const uint leadingInvalid = sOffsets[halfWarpID] & 0xF; + + uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0; + uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 - + ((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF); + uint numIterations = endPos - startPos; + + uint outOffset = startPos + halfWarpOffset; + uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset; + + for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16) + { + if( (outOffset >= sOffsets[halfWarpID]) && + (inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID])) + { + if(blockId < totalBlocks - 1 || outOffset < numElements) + { + outKeys[outOffset] = floatUnflip(sKeys1[inOffset]); + } + } + } + } + + if (loop) + { + blockId += gridDim.x; + __syncthreads(); + } + else + break; + } +} + +/** @} */ // end radixsort functions +/** @} */ // end cudpp_kernel diff --git a/lib/gpu/cudpp_mini/license.txt b/lib/gpu/cudpp_mini/license.txt index 4a14588e1d..2b0d902586 100644 --- a/lib/gpu/cudpp_mini/license.txt +++ b/lib/gpu/cudpp_mini/license.txt @@ -23,3 +23,4 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + \ No newline at end of file diff --git a/lib/gpu/cudpp_mini/radixsort_app.cu b/lib/gpu/cudpp_mini/radixsort_app.cu index 966fe609e7..4f5e2e07f7 100644 --- a/lib/gpu/cudpp_mini/radixsort_app.cu +++ b/lib/gpu/cudpp_mini/radixsort_app.cu @@ -1,993 +1,993 @@ -// ------------------------------------------------------------- -// CUDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision$ -// $Date$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt -// in the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file - * radixsort_app.cu - * - * @brief CUDPP application-level radix sorting routines - */ - -/** @addtogroup cudpp_app - * @{ - */ - -/** @name RadixSort Functions - * @{ - */ - - -#include "cudpp.h" -#include "cudpp_util.h" -#include "cudpp_radixsort.h" -#include "cudpp_scan.h" -#include "kernel/radixsort_kernel.cu" - -#include -#include -#include -#include - -typedef unsigned int uint; - -/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, -* starting at startbit. -* -* Uses cudppScanDispatch() for the prefix sum of radix counters. -* -* @param[in,out] keys Keys to be sorted. -* @param[in,out] values Associated values to be sorted (through keys). -* @param[in] plan Configuration information for RadixSort. -* @param[in] numElements Number of elements in the sort. -**/ -template -void radixSortStep(uint *keys, - uint *values, - const CUDPPRadixSortPlan *plan, - uint numElements) -{ - const uint eltsPerBlock = SORT_CTA_SIZE * 4; - const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; - - bool fullBlocks = ((numElements % eltsPerBlock) == 0); - uint numBlocks = (fullBlocks) ? - (numElements / eltsPerBlock) : - (numElements / eltsPerBlock + 1); - uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? 
- (numElements / eltsPerBlock2) : - (numElements / eltsPerBlock2 + 1); - - bool loop = numBlocks > 65535; - uint blocks = loop ? 65535 : numBlocks; - uint blocksFind = loop ? 65535 : numBlocks2; - uint blocksReorder = loop ? 65535 : numBlocks2; - - uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0]; - - bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); - - if (persist) - { - loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); - - blocks = numBlocks; - blocksFind = numBlocks2; - blocksReorder = numBlocks2; - - // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware - // on GT200, resulting in better scheduling and lower run times - if (startbit > 0) - { - emptyKernel<<>>(); - } - } - - if (fullBlocks) - { - if (loop) - { - if (persist) - { - blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) : - numCTAs(radixSortBlocks<4, 0, true, false, true>); - } - - radixSortBlocks - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); - } - else - { - radixSortBlocks - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); - } - } - else - { - if (loop) - { - if (persist) - { - blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) : - numCTAs(radixSortBlocks<4, 0, false, false, true>); - } - - radixSortBlocks - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); - } - else - { - radixSortBlocks - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); - } - } - - CUT_CHECK_ERROR("radixSortBlocks"); - - if (fullBlocks) - { - if (loop) - { - if (persist) - { - blocksFind = numCTAs(findRadixOffsets<0, true, true>); - } - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - else - { - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - } - else - { - if (loop) - { - if (persist) - { - blocksFind = numCTAs(findRadixOffsets<0, false, true>); - } - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - else - { - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - } - - CUT_CHECK_ERROR("findRadixOffsets"); - - cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); - - if (fullBlocks) - { - if (plan->m_bManualCoalesce) - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) : - numCTAs(reorderData<0, true, true, false, true>); - } - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - else - { - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - } - else - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? 
numCTAs(reorderData<0, true, false, true, true>) : - numCTAs(reorderData<0, true, false, false, true>); - } - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - else - { - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - } - } - else - { - if (plan->m_bManualCoalesce) - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? - numCTAs(reorderData<0, false, true, true, true>) : - numCTAs(reorderData<0, false, true, false, true>); - } - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - else - { - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - } - else - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? - numCTAs(reorderData<0, false, false, true, true>) : - numCTAs(reorderData<0, false, false, false, true>); - } - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - else - { - reorderData - <<>> - (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, - plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); - } - } - } - - CUT_CHECK_ERROR("radixSortStep"); -} - -/** - * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements - * - * @param[in,out] keys Keys to be sorted. - * @param[in,out] values Associated values to be sorted (through keys). - * @param numElements Number of elements in the sort. -**/ -template -void radixSortSingleBlock(uint *keys, - uint *values, - uint numElements) -{ - bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); - if (fullBlocks) - { - radixSortBlocks<32, 0, true, flip, false> - <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> - ((uint4*)keys, (uint4*)values, - (uint4*)keys, (uint4*)values, - numElements, 0); - } - else - { - radixSortBlocks<32, 0, false, flip, false> - <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> - ((uint4*)keys, (uint4*)values, - (uint4*)keys, (uint4*)values, - numElements, 0); - } - - if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); - - CUT_CHECK_ERROR("radixSortSingleBlock"); -} - -/** - * @brief Main radix sort function - * - * Main radix sort function. Sorts in place in the keys and values arrays, - * but uses the other device arrays as temporary storage. All pointer - * parameters are device pointers. Uses cudppScan() for the prefix sum of - * radix counters. - * - * @param[in,out] keys Keys to be sorted. - * @param[in,out] values Associated values to be sorted (through keys). - * @param[in] plan Configuration information for RadixSort. - * @param[in] numElements Number of elements in the sort. - * @param[in] flipBits Is set true if key datatype is a float - * (neg. numbers) for special float sorting operations. 
- * @param[in] keyBits Number of interesting bits in the key - **/ -void radixSort(uint *keys, - uint* values, - const CUDPPRadixSortPlan *plan, - size_t numElements, - bool flipBits, - int keyBits) -{ - if(numElements <= WARP_SIZE) - { - if (flipBits) - radixSortSingleWarp<<<1, numElements>>> - (keys, values, numElements); - else - radixSortSingleWarp<<<1, numElements>>> - (keys, values, numElements); - - CUT_CHECK_ERROR("radixSortSingleWarp"); - return; - } -#ifdef __DEVICE_EMULATION__ - printf("bits: %d\n", keyBits); -#endif - - if(numElements <= SORT_CTA_SIZE * 4) - { - if (flipBits) - radixSortSingleBlock(keys, values, numElements); - else - radixSortSingleBlock(keys, values, numElements); - return; - } - - // flip float bits on the first pass, unflip on the last pass - if (flipBits) - { - radixSortStep<4, 0, true, false> - (keys, values, plan, numElements); - } - else - { - radixSortStep<4, 0, false, false> - (keys, values, plan, numElements); - } - - if (keyBits > 4) - { - radixSortStep<4, 4, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 8) - { - radixSortStep<4, 8, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 12) - { - radixSortStep<4, 12, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 16) - { - radixSortStep<4, 16, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 20) - { - radixSortStep<4, 20, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 24) - { - radixSortStep<4, 24, false, false> - (keys, values, plan, numElements); - } - if (keyBits > 28) - { - if (flipBits) // last pass - { - radixSortStep<4, 28, false, true> - (keys, values, plan, numElements); - } - else - { - radixSortStep<4, 28, false, false> - (keys, values, plan, numElements); - } - } -} - -/** - * @brief Wrapper to call main radix sort function. For float configuration. - * - * Calls the main radix sort function. For float configuration. - * - * @param[in,out] keys Keys to be sorted. - * @param[in,out] values Associated values to be sorted (through keys). - * @param[in] plan Configuration information for RadixSort. - * @param[in] numElements Number of elements in the sort. - * @param[in] negativeKeys Is set true if key datatype has neg. numbers. - * @param[in] keyBits Number of interesting bits in the key - **/ -extern "C" -void radixSortFloatKeys(float* keys, - uint* values, - const CUDPPRadixSortPlan *plan, - size_t numElements, - bool negativeKeys, - int keyBits) -{ - - radixSort((uint*)keys, (uint*)values, plan, - numElements, negativeKeys, keyBits); -} - -/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, - * starting at startbit. - * - * @param[in,out] keys Keys to be sorted. - * @param[in] plan Configuration information for RadixSort. - * @param[in] numElements Number of elements in the sort. -**/ -template -void radixSortStepKeysOnly(uint *keys, - const CUDPPRadixSortPlan *plan, - uint numElements) -{ - const uint eltsPerBlock = SORT_CTA_SIZE * 4; - const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; - - bool fullBlocks = ((numElements % eltsPerBlock) == 0); - uint numBlocks = (fullBlocks) ? - (numElements / eltsPerBlock) : - (numElements / eltsPerBlock + 1); - uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? - (numElements / eltsPerBlock2) : - (numElements / eltsPerBlock2 + 1); - - bool loop = numBlocks > 65535; - - uint blocks = loop ? 65535 : numBlocks; - uint blocksFind = loop ? 65535 : numBlocks2; - uint blocksReorder = loop ? 
65535 : numBlocks2; - - uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1]; - - bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); - - if (persist) - { - loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); - - blocks = numBlocks; - blocksFind = numBlocks2; - blocksReorder = numBlocks2; - } - - if (fullBlocks) - { - if (loop) - { - if (persist) - { - blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) : - numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>); - } - - radixSortBlocksKeysOnly - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); - } - else - radixSortBlocksKeysOnly - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); - } - else - { - if (loop) - { - if (persist) - { - blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) : - numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>); - } - - radixSortBlocksKeysOnly - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); - } - else - radixSortBlocksKeysOnly - <<>> - ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); - - } - - if (fullBlocks) - { - if (loop) - { - if (persist) - { - blocksFind = numCTAs(findRadixOffsets<0, true, true>); - } - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - else - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - else - { - if (loop) - { - if (persist) - { - blocksFind = numCTAs(findRadixOffsets<0, false, true>); - } - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - } - else - findRadixOffsets - <<>> - ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); - - } - - cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); - - if (fullBlocks) - { - if (plan->m_bManualCoalesce) - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? - numCTAs(reorderDataKeysOnly<0, true, true, true, true>) : - numCTAs(reorderDataKeysOnly<0, true, true, false, true>); - } - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? - numCTAs(reorderDataKeysOnly<0, true, false, true, true>) : - numCTAs(reorderDataKeysOnly<0, true, false, false, true>); - } - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - } - else - { - if (plan->m_bManualCoalesce) - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? 
- numCTAs(reorderDataKeysOnly<0, false, true, true, true>) : - numCTAs(reorderDataKeysOnly<0, false, true, false, true>); - } - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - { - if (loop) - { - if (persist) - { - blocksReorder = unflip ? - numCTAs(reorderDataKeysOnly<0, false, false, true, true>) : - numCTAs(reorderDataKeysOnly<0, false, false, false, true>); - } - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - else - reorderDataKeysOnly - <<>> - (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, - numElements, numBlocks2); - } - } - - CUT_CHECK_ERROR("radixSortStepKeysOnly"); -} - -/** - * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only). - * - * @param[in,out] keys Keys to be sorted. - * @param numElements Number of elements in the sort. -**/ -template -void radixSortSingleBlockKeysOnly(uint *keys, - uint numElements) -{ - bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); - if (fullBlocks) - { - radixSortBlocksKeysOnly<32, 0, true, flip, false> - <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> - ((uint4*)keys, (uint4*)keys, numElements, 1 ); - } - else - { - radixSortBlocksKeysOnly<32, 0, false, flip, false> - <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> - ((uint4*)keys, (uint4*)keys, numElements, 1 ); - } - - if (flip) - unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); - - - CUT_CHECK_ERROR("radixSortSingleBlock"); -} - -/** - * @brief Main radix sort function. For keys only configuration. - * - * Main radix sort function. Sorts in place in the keys array, - * but uses the other device arrays as temporary storage. All pointer - * parameters are device pointers. Uses scan for the prefix sum of - * radix counters. - * - * @param[in,out] keys Keys to be sorted. - * @param[in] plan Configuration information for RadixSort. - * @param[in] flipBits Is set true if key datatype is a float (neg. numbers) - * for special float sorting operations. - * @param[in] numElements Number of elements in the sort. 
- * @param[in] keyBits Number of interesting bits in the key -**/ -extern "C" -void radixSortKeysOnly(uint *keys, - const CUDPPRadixSortPlan *plan, - bool flipBits, - size_t numElements, - int keyBits) -{ - - if(numElements <= WARP_SIZE) - { - if (flipBits) - radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); - else - radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); - return; - } - if(numElements <= SORT_CTA_SIZE * 4) - { - if (flipBits) - radixSortSingleBlockKeysOnly(keys, numElements); - else - radixSortSingleBlockKeysOnly(keys, numElements); - return; - } - - // flip float bits on the first pass, unflip on the last pass - if (flipBits) - { - radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements); - } - else - { - radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements); - } - - if (keyBits > 4) - { - radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements); - } - if (keyBits > 8) - { - radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements); - } - if (keyBits > 12) - { - radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements); - } - if (keyBits > 16) - { - radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements); - } - if (keyBits > 20) - { - radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements); - } - if (keyBits > 24) - { - radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements); - } - if (keyBits > 28) - { - if (flipBits) // last pass - { - radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements); - } - else - { - radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements); - } - } -} - -/** - * @brief Wrapper to call main radix sort function. For floats and keys only. - * - * Calls the radixSortKeysOnly function setting parameters for floats. - * - * @param[in,out] keys Keys to be sorted. - * @param[in] plan Configuration information for RadixSort. - * @param[in] negativeKeys Is set true if key flipBits is to be true in - * radixSortKeysOnly(). - * @param[in] numElements Number of elements in the sort. - * @param[in] keyBits Number of interesting bits in the key -**/ -extern "C" -void radixSortFloatKeysOnly(float *keys, - const CUDPPRadixSortPlan *plan, - bool negativeKeys, - size_t numElements, - int keyBits) -{ - radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits); -} - -extern "C" -void initDeviceParameters(CUDPPRadixSortPlan *plan) -{ - int deviceID = -1; - if (cudaSuccess == cudaGetDevice(&deviceID)) - { - cudaDeviceProp devprop; - cudaGetDeviceProperties(&devprop, deviceID); - - int smVersion = devprop.major * 10 + devprop.minor; - - // sm_12 and later devices don't need help with coalesce in reorderData kernel - plan->m_bManualCoalesce = (smVersion < 12); - - // sm_20 and later devices are better off not using persistent CTAs - plan->m_bUsePersistentCTAs = (smVersion < 20); - - if (plan->m_bUsePersistentCTAs) - { - // The following is only true on pre-sm_20 devices (pre-Fermi): - // Empirically we have found that for some (usually larger) sort - // sizes it is better to use exactly as many "persistent" CTAs - // as can fill the GPU, which loop over the "blocks" of work. For smaller - // arrays it is better to use the typical CUDA approach of launching one CTA - // per block of work. - // 0-element of these two-element arrays is for key-value sorts - // 1-element is for key-only sorts - plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 
16777216 : 524288; - plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288; - plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608; - plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0; - - // create a map of function pointers to register counts for more accurate occupancy calculation - // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it - // Note we only insert the "loop" version of the kernels (the one with the last template param = true) - // Because those are the only ones that require persistent CTAs that maximally fill the device. - computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - - computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - - computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); - - computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE); - - computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE); - computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE); - - computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE); - } - } -} - -/** - * @brief From the programmer-specified sort configuration, - * creates internal memory for performing the sort. - * - * @param[in] plan Pointer to CUDPPRadixSortPlan object -**/ -extern "C" -void allocRadixSortStorage(CUDPPRadixSortPlan *plan) -{ - - unsigned int numElements = plan->m_numElements; - - unsigned int numBlocks = - ((numElements % (SORT_CTA_SIZE * 4)) == 0) ? 
- (numElements / (SORT_CTA_SIZE * 4)) : - (numElements / (SORT_CTA_SIZE * 4) + 1); - - switch(plan->m_config.datatype) - { - case CUDPP_UINT: - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, - numElements * sizeof(unsigned int))); - - if (!plan->m_bKeysOnly) - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, - numElements * sizeof(unsigned int))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, - WARP_SIZE * numBlocks * sizeof(unsigned int))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, - WARP_SIZE * numBlocks * sizeof(unsigned int))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, - WARP_SIZE * numBlocks * sizeof(unsigned int))); - break; - - case CUDPP_FLOAT: - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, - numElements * sizeof(float))); - - if (!plan->m_bKeysOnly) - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, - numElements * sizeof(float))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, - WARP_SIZE * numBlocks * sizeof(float))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, - WARP_SIZE * numBlocks * sizeof(float))); - - CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, - WARP_SIZE * numBlocks * sizeof(float))); - break; - } - - initDeviceParameters(plan); -} - -/** @brief Deallocates intermediate memory from allocRadixSortStorage. - * - * - * @param[in] plan Pointer to CUDPPRadixSortPlan object -**/ -extern "C" -void freeRadixSortStorage(CUDPPRadixSortPlan* plan) -{ - CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys)); - CUDA_SAFE_CALL( cudaFree(plan->m_tempValues)); - CUDA_SAFE_CALL( cudaFree(plan->m_counters)); - CUDA_SAFE_CALL( cudaFree(plan->m_countersSum)); - CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets)); -} - -/** @brief Dispatch function to perform a sort on an array with - * a specified configuration. - * - * This is the dispatch routine which calls radixSort...() with - * appropriate template parameters and arguments as specified by - * the plan. - * @param[in,out] keys Keys to be sorted. - * @param[in,out] values Associated values to be sorted (through keys). - * @param[in] numElements Number of elements in the sort. - * @param[in] keyBits Number of interesting bits in the key* - * @param[in] plan Configuration information for RadixSort. -**/ -extern "C" -void cudppRadixSortDispatch(void *keys, - void *values, - size_t numElements, - int keyBits, - const CUDPPRadixSortPlan *plan) -{ - if(plan->m_bKeysOnly) - { - switch(plan->m_config.datatype) - { - case CUDPP_UINT: - radixSortKeysOnly((uint*)keys, plan, false, - numElements, keyBits); - break; - case CUDPP_FLOAT: - radixSortFloatKeysOnly((float*)keys, plan, true, - numElements, keyBits); - } - } - else - { - switch(plan->m_config.datatype) - { - case CUDPP_UINT: - radixSort((uint*)keys, (uint*) values, plan, - numElements, false, keyBits); - break; - case CUDPP_FLOAT: - radixSortFloatKeys((float*)keys, (uint*) values, plan, - numElements, true, keyBits); - } - } -} - -/** @} */ // end radixsort functions -/** @} */ // end cudpp_app +// ------------------------------------------------------------- +// CUDPP -- CUDA Data Parallel Primitives library +// ------------------------------------------------------------- +// $Revision$ +// $Date$ +// ------------------------------------------------------------- +// This source code is distributed under the terms of license.txt +// in the root directory of this source distribution. 
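// Usage sketch for the cudppRadixSortDispatch entry point defined in this
// file (hypothetical device pointers and element count; in CUDPP this
// internal dispatcher is normally reached through the public plan API):
//
//   uint *d_keys, *d_values;   // device arrays of numElements uints each
//   // ... allocate and fill d_keys/d_values, create a CUDPPRadixSortPlan ...
//   cudppRadixSortDispatch(d_keys, d_values, numElements, 32, plan);
//
// Passing keyBits = 32 makes every 4-bit pass run; a smaller keyBits skips
// the high-order passes, as the keyBits > 4 ... keyBits > 28 chain shows.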
+// ------------------------------------------------------------- + +/** + * @file + * radixsort_app.cu + * + * @brief CUDPP application-level radix sorting routines + */ + +/** @addtogroup cudpp_app + * @{ + */ + +/** @name RadixSort Functions + * @{ + */ + + +#include "cudpp.h" +#include "cudpp_util.h" +#include "cudpp_radixsort.h" +#include "cudpp_scan.h" +#include "kernel/radixsort_kernel.cu" + +#include +#include +#include +#include + +typedef unsigned int uint; + +/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, +* starting at startbit. +* +* Uses cudppScanDispatch() for the prefix sum of radix counters. +* +* @param[in,out] keys Keys to be sorted. +* @param[in,out] values Associated values to be sorted (through keys). +* @param[in] plan Configuration information for RadixSort. +* @param[in] numElements Number of elements in the sort. +**/ +template +void radixSortStep(uint *keys, + uint *values, + const CUDPPRadixSortPlan *plan, + uint numElements) +{ + const uint eltsPerBlock = SORT_CTA_SIZE * 4; + const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; + + bool fullBlocks = ((numElements % eltsPerBlock) == 0); + uint numBlocks = (fullBlocks) ? + (numElements / eltsPerBlock) : + (numElements / eltsPerBlock + 1); + uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? + (numElements / eltsPerBlock2) : + (numElements / eltsPerBlock2 + 1); + + bool loop = numBlocks > 65535; + uint blocks = loop ? 65535 : numBlocks; + uint blocksFind = loop ? 65535 : numBlocks2; + uint blocksReorder = loop ? 65535 : numBlocks2; + + uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0]; + + bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); + + if (persist) + { + loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); + + blocks = numBlocks; + blocksFind = numBlocks2; + blocksReorder = numBlocks2; + + // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware + // on GT200, resulting in better scheduling and lower run times + if (startbit > 0) + { + emptyKernel<<>>(); + } + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) : + numCTAs(radixSortBlocks<4, 0, true, false, true>); + } + + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + else + { + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + } + else + { + if (loop) + { + if (persist) + { + blocks = flip ? 
numCTAs(radixSortBlocks<4, 0, false, true, true>) : + numCTAs(radixSortBlocks<4, 0, false, false, true>); + } + + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + else + { + radixSortBlocks + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); + } + } + + CUT_CHECK_ERROR("radixSortBlocks"); + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, true, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, false, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + } + + CUT_CHECK_ERROR("findRadixOffsets"); + + cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); + + if (fullBlocks) + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) : + numCTAs(reorderData<0, true, true, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) : + numCTAs(reorderData<0, true, false, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + } + else + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderData<0, false, true, true, true>) : + numCTAs(reorderData<0, false, true, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? 
+ numCTAs(reorderData<0, false, false, true, true>) : + numCTAs(reorderData<0, false, false, false, true>); + } + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + else + { + reorderData + <<>> + (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, + plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); + } + } + } + + CUT_CHECK_ERROR("radixSortStep"); +} + +/** + * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param numElements Number of elements in the sort. +**/ +template +void radixSortSingleBlock(uint *keys, + uint *values, + uint numElements) +{ + bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); + if (fullBlocks) + { + radixSortBlocks<32, 0, true, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)values, + (uint4*)keys, (uint4*)values, + numElements, 0); + } + else + { + radixSortBlocks<32, 0, false, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)values, + (uint4*)keys, (uint4*)values, + numElements, 0); + } + + if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); + + CUT_CHECK_ERROR("radixSortSingleBlock"); +} + +/** + * @brief Main radix sort function + * + * Main radix sort function. Sorts in place in the keys and values arrays, + * but uses the other device arrays as temporary storage. All pointer + * parameters are device pointers. Uses cudppScan() for the prefix sum of + * radix counters. + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. + * @param[in] flipBits Is set true if key datatype is a float + * (neg. numbers) for special float sorting operations. 
+ * @param[in] keyBits Number of interesting bits in the key + **/ +void radixSort(uint *keys, + uint* values, + const CUDPPRadixSortPlan *plan, + size_t numElements, + bool flipBits, + int keyBits) +{ + if(numElements <= WARP_SIZE) + { + if (flipBits) + radixSortSingleWarp<<<1, numElements>>> + (keys, values, numElements); + else + radixSortSingleWarp<<<1, numElements>>> + (keys, values, numElements); + + CUT_CHECK_ERROR("radixSortSingleWarp"); + return; + } +#ifdef __DEVICE_EMULATION__ + printf("bits: %d\n", keyBits); +#endif + + if(numElements <= SORT_CTA_SIZE * 4) + { + if (flipBits) + radixSortSingleBlock(keys, values, numElements); + else + radixSortSingleBlock(keys, values, numElements); + return; + } + + // flip float bits on the first pass, unflip on the last pass + if (flipBits) + { + radixSortStep<4, 0, true, false> + (keys, values, plan, numElements); + } + else + { + radixSortStep<4, 0, false, false> + (keys, values, plan, numElements); + } + + if (keyBits > 4) + { + radixSortStep<4, 4, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 8) + { + radixSortStep<4, 8, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 12) + { + radixSortStep<4, 12, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 16) + { + radixSortStep<4, 16, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 20) + { + radixSortStep<4, 20, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 24) + { + radixSortStep<4, 24, false, false> + (keys, values, plan, numElements); + } + if (keyBits > 28) + { + if (flipBits) // last pass + { + radixSortStep<4, 28, false, true> + (keys, values, plan, numElements); + } + else + { + radixSortStep<4, 28, false, false> + (keys, values, plan, numElements); + } + } +} + +/** + * @brief Wrapper to call main radix sort function. For float configuration. + * + * Calls the main radix sort function. For float configuration. + * + * @param[in,out] keys Keys to be sorted. + * @param[in,out] values Associated values to be sorted (through keys). + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. + * @param[in] negativeKeys Is set true if key datatype has neg. numbers. + * @param[in] keyBits Number of interesting bits in the key + **/ +extern "C" +void radixSortFloatKeys(float* keys, + uint* values, + const CUDPPRadixSortPlan *plan, + size_t numElements, + bool negativeKeys, + int keyBits) +{ + + radixSort((uint*)keys, (uint*)values, plan, + numElements, negativeKeys, keyBits); +} + +/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, + * starting at startbit. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] numElements Number of elements in the sort. +**/ +template +void radixSortStepKeysOnly(uint *keys, + const CUDPPRadixSortPlan *plan, + uint numElements) +{ + const uint eltsPerBlock = SORT_CTA_SIZE * 4; + const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; + + bool fullBlocks = ((numElements % eltsPerBlock) == 0); + uint numBlocks = (fullBlocks) ? + (numElements / eltsPerBlock) : + (numElements / eltsPerBlock + 1); + uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? + (numElements / eltsPerBlock2) : + (numElements / eltsPerBlock2 + 1); + + bool loop = numBlocks > 65535; + + uint blocks = loop ? 65535 : numBlocks; + uint blocksFind = loop ? 65535 : numBlocks2; + uint blocksReorder = loop ? 
65535 : numBlocks2; + + uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1]; + + bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); + + if (persist) + { + loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); + + blocks = numBlocks; + blocksFind = numBlocks2; + blocksReorder = numBlocks2; + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) : + numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>); + } + + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + { + if (loop) + { + if (persist) + { + blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) : + numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>); + } + + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + } + else + radixSortBlocksKeysOnly + <<>> + ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); + + } + + if (fullBlocks) + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, true, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksFind = numCTAs(findRadixOffsets<0, false, true>); + } + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + } + else + findRadixOffsets + <<>> + ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); + + } + + cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); + + if (fullBlocks) + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, true, true, true, true>) : + numCTAs(reorderDataKeysOnly<0, true, true, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, true, false, true, true>) : + numCTAs(reorderDataKeysOnly<0, true, false, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + } + else + { + if (plan->m_bManualCoalesce) + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? 
+ numCTAs(reorderDataKeysOnly<0, false, true, true, true>) : + numCTAs(reorderDataKeysOnly<0, false, true, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + { + if (loop) + { + if (persist) + { + blocksReorder = unflip ? + numCTAs(reorderDataKeysOnly<0, false, false, true, true>) : + numCTAs(reorderDataKeysOnly<0, false, false, false, true>); + } + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + else + reorderDataKeysOnly + <<>> + (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, + numElements, numBlocks2); + } + } + + CUT_CHECK_ERROR("radixSortStepKeysOnly"); +} + +/** + * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only). + * + * @param[in,out] keys Keys to be sorted. + * @param numElements Number of elements in the sort. +**/ +template +void radixSortSingleBlockKeysOnly(uint *keys, + uint numElements) +{ + bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); + if (fullBlocks) + { + radixSortBlocksKeysOnly<32, 0, true, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)keys, numElements, 1 ); + } + else + { + radixSortBlocksKeysOnly<32, 0, false, flip, false> + <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> + ((uint4*)keys, (uint4*)keys, numElements, 1 ); + } + + if (flip) + unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); + + + CUT_CHECK_ERROR("radixSortSingleBlock"); +} + +/** + * @brief Main radix sort function. For keys only configuration. + * + * Main radix sort function. Sorts in place in the keys array, + * but uses the other device arrays as temporary storage. All pointer + * parameters are device pointers. Uses scan for the prefix sum of + * radix counters. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] flipBits Is set true if key datatype is a float (neg. numbers) + * for special float sorting operations. + * @param[in] numElements Number of elements in the sort. 
+ * @param[in] keyBits Number of interesting bits in the key +**/ +extern "C" +void radixSortKeysOnly(uint *keys, + const CUDPPRadixSortPlan *plan, + bool flipBits, + size_t numElements, + int keyBits) +{ + + if(numElements <= WARP_SIZE) + { + if (flipBits) + radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); + else + radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); + return; + } + if(numElements <= SORT_CTA_SIZE * 4) + { + if (flipBits) + radixSortSingleBlockKeysOnly(keys, numElements); + else + radixSortSingleBlockKeysOnly(keys, numElements); + return; + } + + // flip float bits on the first pass, unflip on the last pass + if (flipBits) + { + radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements); + } + else + { + radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements); + } + + if (keyBits > 4) + { + radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements); + } + if (keyBits > 8) + { + radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements); + } + if (keyBits > 12) + { + radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements); + } + if (keyBits > 16) + { + radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements); + } + if (keyBits > 20) + { + radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements); + } + if (keyBits > 24) + { + radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements); + } + if (keyBits > 28) + { + if (flipBits) // last pass + { + radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements); + } + else + { + radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements); + } + } +} + +/** + * @brief Wrapper to call main radix sort function. For floats and keys only. + * + * Calls the radixSortKeysOnly function setting parameters for floats. + * + * @param[in,out] keys Keys to be sorted. + * @param[in] plan Configuration information for RadixSort. + * @param[in] negativeKeys Is set true if key flipBits is to be true in + * radixSortKeysOnly(). + * @param[in] numElements Number of elements in the sort. + * @param[in] keyBits Number of interesting bits in the key +**/ +extern "C" +void radixSortFloatKeysOnly(float *keys, + const CUDPPRadixSortPlan *plan, + bool negativeKeys, + size_t numElements, + int keyBits) +{ + radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits); +} + +extern "C" +void initDeviceParameters(CUDPPRadixSortPlan *plan) +{ + int deviceID = -1; + if (cudaSuccess == cudaGetDevice(&deviceID)) + { + cudaDeviceProp devprop; + cudaGetDeviceProperties(&devprop, deviceID); + + int smVersion = devprop.major * 10 + devprop.minor; + + // sm_12 and later devices don't need help with coalesce in reorderData kernel + plan->m_bManualCoalesce = (smVersion < 12); + + // sm_20 and later devices are better off not using persistent CTAs + plan->m_bUsePersistentCTAs = (smVersion < 20); + + if (plan->m_bUsePersistentCTAs) + { + // The following is only true on pre-sm_20 devices (pre-Fermi): + // Empirically we have found that for some (usually larger) sort + // sizes it is better to use exactly as many "persistent" CTAs + // as can fill the GPU, which loop over the "blocks" of work. For smaller + // arrays it is better to use the typical CUDA approach of launching one CTA + // per block of work. + // 0-element of these two-element arrays is for key-value sorts + // 1-element is for key-only sorts + plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 
16777216 : 524288; + plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288; + plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608; + plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0; + + // create a map of function pointers to register counts for more accurate occupancy calculation + // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it + // Note we only insert the "loop" version of the kernels (the one with the last template param = true) + // Because those are the only ones that require persistent CTAs that maximally fill the device. + computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); + + computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE); + + computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE); + computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE); + + computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE); + } + } +} + +/** + * @brief From the programmer-specified sort configuration, + * creates internal memory for performing the sort. + * + * @param[in] plan Pointer to CUDPPRadixSortPlan object +**/ +extern "C" +void allocRadixSortStorage(CUDPPRadixSortPlan *plan) +{ + + unsigned int numElements = plan->m_numElements; + + unsigned int numBlocks = + ((numElements % (SORT_CTA_SIZE * 4)) == 0) ? 
+/**
+ * @brief From the programmer-specified sort configuration,
+ *        creates internal memory for performing the sort.
+ *
+ * @param[in] plan Pointer to CUDPPRadixSortPlan object
+**/
+extern "C"
+void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
+{
+    unsigned int numElements = plan->m_numElements;
+
+    // number of blocks of SORT_CTA_SIZE * 4 elements, rounded up
+    unsigned int numBlocks =
+        ((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
+            (numElements / (SORT_CTA_SIZE * 4)) :
+            (numElements / (SORT_CTA_SIZE * 4) + 1);
+
+    switch(plan->m_config.datatype)
+    {
+    case CUDPP_UINT:
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
+                                  numElements * sizeof(unsigned int)));
+
+        if (!plan->m_bKeysOnly)
+            CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
+                                      numElements * sizeof(unsigned int)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
+                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
+                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
+                                  WARP_SIZE * numBlocks * sizeof(unsigned int)));
+        break;
+
+    case CUDPP_FLOAT:
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
+                                  numElements * sizeof(float)));
+
+        if (!plan->m_bKeysOnly)
+            CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
+                                      numElements * sizeof(float)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
+                                  WARP_SIZE * numBlocks * sizeof(float)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
+                                  WARP_SIZE * numBlocks * sizeof(float)));
+
+        CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
+                                  WARP_SIZE * numBlocks * sizeof(float)));
+        break;
+    }
+
+    initDeviceParameters(plan);
+}
+
+/** @brief Deallocates intermediate memory from allocRadixSortStorage.
+ *
+ * @param[in] plan Pointer to CUDPPRadixSortPlan object
+**/
+extern "C"
+void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
+{
+    CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
+    CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
+    CUDA_SAFE_CALL( cudaFree(plan->m_counters));
+    CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
+    CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
+}
+
+/** @brief Dispatch function to perform a sort on an array with
+ *        a specified configuration.
+ *
+ * This is the dispatch routine which calls radixSort...() with
+ * appropriate template parameters and arguments as specified by
+ * the plan.
+ * @param[in,out] keys        Keys to be sorted.
+ * @param[in,out] values      Associated values to be sorted (through keys).
+ * @param[in]     numElements Number of elements in the sort.
+ * @param[in]     keyBits     Number of interesting bits in the key.
+ * @param[in]     plan        Configuration information for RadixSort.
+**/
+extern "C"
+void cudppRadixSortDispatch(void  *keys,
+                            void  *values,
+                            size_t numElements,
+                            int    keyBits,
+                            const CUDPPRadixSortPlan *plan)
+{
+    if(plan->m_bKeysOnly)
+    {
+        switch(plan->m_config.datatype)
+        {
+        case CUDPP_UINT:
+            radixSortKeysOnly((uint*)keys, plan, false,
+                              numElements, keyBits);
+            break;
+        case CUDPP_FLOAT:
+            radixSortFloatKeysOnly((float*)keys, plan, true,
+                                   numElements, keyBits);
+            break;
+        }
+    }
+    else
+    {
+        switch(plan->m_config.datatype)
+        {
+        case CUDPP_UINT:
+            radixSort((uint*)keys, (uint*)values, plan,
+                      numElements, false, keyBits);
+            break;
+        case CUDPP_FLOAT:
+            radixSortFloatKeys((float*)keys, (uint*)values, plan,
+                               numElements, true, keyBits);
+            break;
+        }
+    }
+}
+
+/** @} */ // end radixsort functions
+/** @} */ // end cudpp_app
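End to end, the host-side lifecycle is plan-scoped: scratch buffers are sized from plan->m_numElements, the dispatch picks the typed entry point, and teardown frees the same five buffers. A hypothetical caller, shown only to make the call order concrete; it assumes a plan already built for CUDPP_FLOAT key-value sorting through the public cudppPlan() path:

    // Illustrative call order only; in CUDPP proper, allocRadixSortStorage()
    // runs once at plan creation, not before every sort.
    void sortFloatKeyValuePairs(float *d_keys, unsigned int *d_values,
                                size_t n, CUDPPRadixSortPlan *plan)
    {
        allocRadixSortStorage(plan);            // temp keys/values + counters
        cudppRadixSortDispatch(d_keys, d_values, n,
                               32 /* all key bits */, plan);
        freeRadixSortStorage(plan);             // release the five buffers
    }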
diff --git a/lib/gpu/cudpp_mini/sharedmem.h b/lib/gpu/cudpp_mini/sharedmem.h
index 77f92adfed..87d4669d91 100644
--- a/lib/gpu/cudpp_mini/sharedmem.h
+++ b/lib/gpu/cudpp_mini/sharedmem.h
@@ -7,7 +7,7 @@
 // This source code is distributed under the terms of license.txt
 // in the root directory of this source distribution.
 // -------------------------------------------------------------
-
+
 /**
  * @file
  * sharedmem.h
diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp
index 444f4e80bd..5cdc52a4f0 100644
--- a/lib/gpu/gb_gpu.cpp
+++ b/lib/gpu/gb_gpu.cpp
@@ -10,7 +10,7 @@
    See the README file in the top-level LAMMPS directory. 
 ------------------------------------------------------------------------- */
-
+
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp
index 0300e69721..89ad45557f 100644
--- a/lib/gpu/gb_gpu_memory.cpp
+++ b/lib/gpu/gb_gpu_memory.cpp
@@ -10,7 +10,7 @@
    See the README file in the top-level LAMMPS directory. 
 ------------------------------------------------------------------------- */
-
+
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt
index f01f03ca0a..77e0a073c7 100644
--- a/lib/gpu/geryon/VERSION.txt
+++ b/lib/gpu/geryon/VERSION.txt
@@ -1 +1,2 @@
 Geryon Version 10.280
+
\ No newline at end of file
diff --git a/lib/gpu/geryon/file_to_cstr.sh b/lib/gpu/geryon/file_to_cstr.sh
index e8264dfad8..a60d4f0c19 100755
--- a/lib/gpu/geryon/file_to_cstr.sh
+++ b/lib/gpu/geryon/file_to_cstr.sh
@@ -6,7 +6,7 @@
 # requires: sed, rm, mv
 #
 # Author: Axel Kohlmeyer, Temple University
-
+
 num_args=$#
 
 # we write to a scratch file, since
diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h
index 2187385077..e5c2d801d0 100644
--- a/lib/gpu/geryon/nvc_device.h
+++ b/lib/gpu/geryon/nvc_device.h
@@ -13,7 +13,7 @@
     copyright            : (C) 2009 by W. Michael Brown
     email                : brownw@ornl.gov
  ***************************************************************************/
-
+
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
diff --git a/lib/gpu/geryon/nvc_macros.h b/lib/gpu/geryon/nvc_macros.h
index 3fb488072c..b9a4e8d6f3 100644
--- a/lib/gpu/geryon/nvc_macros.h
+++ b/lib/gpu/geryon/nvc_macros.h
@@ -6,7 +6,7 @@
 #undef _GLIBCXX_ATOMIC_BUILTINS
 #endif // _GLIBCXX_ATOMIC_BUILTINS
 #endif // __APPLE__
-
+
 #include
 #include
 #include
diff --git a/lib/gpu/geryon/nvc_texture.h b/lib/gpu/geryon/nvc_texture.h
index 939f385340..4a7cb7bbdb 100644
--- a/lib/gpu/geryon/nvc_texture.h
+++ b/lib/gpu/geryon/nvc_texture.h
@@ -20,7 +20,7 @@
    certain rights in this software.  This software is distributed under 
    the Simplified BSD License.
    ----------------------------------------------------------------------- */
-
+
 #ifndef NVC_TEXTURE
 #define NVC_TEXTURE
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index f1eb104c76..b407c1ede3 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -20,7 +20,7 @@
    certain rights in this software.  This software is distributed under 
    the Simplified BSD License.
----------------------------------------------------------------------- */ - + #ifndef NVD_DEVICE #define NVD_DEVICE diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index 1f53cfaadb..fd79863e1b 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_KERNEL #define NVD_KERNEL diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index 4e88fd3201..8cedf5c212 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -1,6 +1,6 @@ #ifndef NVD_MACROS_H #define NVD_MACROS_H - + #include #include #include diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h index ed42305a70..a6b989a698 100644 --- a/lib/gpu/geryon/nvd_mat.h +++ b/lib/gpu/geryon/nvd_mat.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + /*! \file */ #ifndef NVD_MAT_H diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index 2bb6762370..41d656b8cf 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_MEMORY_H #define NVD_MEMORY_H diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h index 3fbf80180b..5e5589d7a7 100644 --- a/lib/gpu/geryon/nvd_texture.h +++ b/lib/gpu/geryon/nvd_texture.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_TEXTURE #define NVD_TEXTURE diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h index cf7cf6c572..e068e53d8f 100644 --- a/lib/gpu/geryon/nvd_timer.h +++ b/lib/gpu/geryon/nvd_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef NVD_TIMER_H #define NVD_TIMER_H diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 8ef5f32454..97cbf7f1e7 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef OCL_DEVICE #define OCL_DEVICE diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 4e3019b62e..4a02f848a6 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 180b292d3b..ee21f41aca 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + /*! \file */ #ifndef OCL_MAT_H diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8e72c51730..6afb98c545 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef OCL_TEXTURE #define OCL_TEXTURE diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 649076c1e9..072c2f212d 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -20,7 +20,7 @@ certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + #ifndef OCL_TIMER_H #define OCL_TIMER_H diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h index 78ec66ddc9..208f9c824f 100644 --- a/lib/gpu/geryon/ucl_arg_kludge.h +++ b/lib/gpu/geryon/ucl_arg_kludge.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 844071c9b5..ea54954149 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c201cc0b12..aba52ddc48 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index 115d7a6dd6..c0531b2f29 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 1873642181..45c94bee82 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp index 1fa758fb46..0797c718dc 100644 --- a/lib/gpu/geryon/ucl_get_devices.cpp +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index bfd4a6ce99..51593cfa23 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index fb60e8cf17..ca1dd12a47 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index a162b32c2a..1ea9175e3a 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h index 0152764225..1862f5be73 100644 --- a/lib/gpu/geryon/ucl_print.h +++ b/lib/gpu/geryon/ucl_print.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 9dabf16687..bfd1f076eb 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index eba26018e4..6411b20853 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index 0078e1ecf3..d365d71044 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h index 214a951c76..483ef05570 100644 --- a/lib/gpu/lj96_cut_gpu_memory.h +++ b/lib/gpu/lj96_cut_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp index 55454022f7..020aeaa3ac 100644 --- a/lib/gpu/lj_cut_gpu.cpp +++ b/lib/gpu/lj_cut_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index deb8b264c0..23b2fcf6d0 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h index b03486bda2..123b739649 100644 --- a/lib/gpu/lj_cut_gpu_memory.h +++ b/lib/gpu/lj_cut_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp index 784ab38633..7943d20e63 100644 --- a/lib/gpu/ljc_cut_gpu.cpp +++ b/lib/gpu/ljc_cut_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index ec2001aa4a..d63ed6e5d9 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h index 2d50bd6d16..4dedce957a 100644 --- a/lib/gpu/ljc_cut_gpu_memory.h +++ b/lib/gpu/ljc_cut_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp index 1861350596..6fda0566a7 100644 --- a/lib/gpu/ljcl_cut_gpu.cpp +++ b/lib/gpu/ljcl_cut_gpu.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index 21716c183d..a126309a92 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h index 59379cb4c8..056ba0e41f 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.h +++ b/lib/gpu/ljcl_cut_gpu_memory.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 6e86a9b64e..b158d67dfa 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index 41640002f3..5d7b74dac9 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index 0262b3dcd6..ab03e814b6 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index e7a78328d9..ed2baa0cc6 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp index 63048b7560..123fbe54aa 100644 --- a/lib/gpu/pair_gpu_nbor.cpp +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov Peng Wang (Nvidia), penwang@nvidia.com diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h index 2af36f3bc2..554466a2fa 100644 --- a/lib/gpu/pair_gpu_precision.h +++ b/lib/gpu/pair_gpu_precision.h @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/pair_win_sort.cpp b/lib/gpu/pair_win_sort.cpp index 9e4e46cc95..4843a7ca17 100644 --- a/lib/gpu/pair_win_sort.cpp +++ b/lib/gpu/pair_win_sort.cpp @@ -10,7 +10,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ - + /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */