(keys);
+
+ // Twiddle bits if necessary
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+ }
+
+ // Radix sorting passes
+ while (true)
+ {
+ // Rank the blocked keys
+ int ranks[ITEMS_PER_THREAD];
+ BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
+ begin_bit += RADIX_BITS;
+
+ __syncthreads();
+
+ // Check if this is the last pass
+ if (begin_bit >= end_bit)
+ {
+ // Last pass exchanges keys through shared memory in striped arrangement
+ BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
+
+ __syncthreads();
+
+ // Last pass exchanges through shared memory in striped arrangement
+ BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks);
+
+ // Quit
+ break;
+ }
+
+ // Exchange keys through shared memory in blocked arrangement
+ BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
+
+ __syncthreads();
+
+ // Exchange values through shared memory in blocked arrangement
+ BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
+
+ __syncthreads();
+ }
+
+ // Untwiddle bits if necessary
+ #pragma unroll
+ for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+ {
+ unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+ }
+ }
+
+
+ //@} end member group
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/lib/kokkos/TPL/cub/block/block_raking_layout.cuh b/lib/kokkos/TPL/cub/block/block_raking_layout.cuh
new file mode 100755
index 0000000000..878a786cd9
--- /dev/null
+++ b/lib/kokkos/TPL/cub/block/block_raking_layout.cuh
@@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for raking across thread block data. 
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items. Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T The data type to be exchanged.
+ * \tparam BLOCK_THREADS The thread block size in threads.
+ * \tparam BLOCK_STRIPS When strip-mining, the number of threadblock-strips per tile
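+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the block size and the per-thread
+ * value \p thread_partial are hypothetical placeholders):
+ * \code
+ * typedef cub::BlockRakingLayout<int, 128> RakingLayout;
+ * __shared__ typename RakingLayout::TempStorage raking_grid;
+ *
+ * // Each thread deposits its partial into the conflict-free raking grid
+ * *RakingLayout::PlacementPtr(raking_grid, threadIdx.x) = thread_partial;
+ * __syncthreads();
+ *
+ * // One warp of threads serially rakes consecutive segments
+ * if (threadIdx.x < RakingLayout::RAKING_THREADS)
+ * {
+ *     int *segment = RakingLayout::RakingPtr(raking_grid, threadIdx.x);
+ *     int raked = segment[0];
+ *     for (int i = 1; i < RakingLayout::SEGMENT_LENGTH; ++i)
+ *         raked += segment[i];
+ *     // raked now holds this segment's partial reduction
+ * }
+ * \endcode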
+ */
+template <
+ typename T,
+ int BLOCK_THREADS,
+ int BLOCK_STRIPS = 1>
+struct BlockRakingLayout
+{
+ //---------------------------------------------------------------------
+ // Constants and typedefs
+ //---------------------------------------------------------------------
+
+ enum
+ {
+ /// The total number of elements that need to be cooperatively reduced
+ SHARED_ELEMENTS =
+ BLOCK_THREADS * BLOCK_STRIPS,
+
+ /// Maximum number of warp-synchronous raking threads
+ MAX_RAKING_THREADS =
+ CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS),
+
+ /// Number of raking elements per warp-synchronous raking thread (rounded up)
+ SEGMENT_LENGTH =
+ (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+ /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+ RAKING_THREADS =
+ (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+ /// Pad each segment length with one element if it evenly divides the number of banks
+ SEGMENT_PADDING =
+ (PtxArchProps::SMEM_BANKS % SEGMENT_LENGTH == 0) ? 1 : 0,
+
+ /// Total number of elements in the raking grid
+ GRID_ELEMENTS =
+ RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
+
+ /// Whether or not raking can proceed without bounds checking (i.e., the number of reduction elements is a multiple of the number of raking threads)
+ UNGUARDED =
+ (SHARED_ELEMENTS % RAKING_THREADS == 0),
+ };
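+
+ // Illustrative worked example (hypothetical parameters): for T = int and
+ // BLOCK_THREADS = 128 on an architecture with 32-thread warps and 32 shared
+ // memory banks, the constants above resolve to:
+ // SHARED_ELEMENTS = 128
+ // MAX_RAKING_THREADS = min(128, 32) = 32
+ // SEGMENT_LENGTH = (128 + 31) / 32 = 4
+ // RAKING_THREADS = (128 + 3) / 4 = 32
+ // SEGMENT_PADDING = (32 % 4 == 0) ? 1 : 0 = 1
+ // GRID_ELEMENTS = 32 * (4 + 1) = 160
+ // UNGUARDED = (128 % 32 == 0) = true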
+
+
+ /**
+ * \brief Shared memory storage type
+ */
+ typedef T TempStorage[BlockRakingLayout::GRID_ELEMENTS];
+
+
+ /**
+ * \brief Returns the location for the calling thread to place data into the grid
+ */
+ static __device__ __forceinline__ T* PlacementPtr(
+ TempStorage &temp_storage,
+ int linear_tid,
+ int block_strip = 0)
+ {
+ // Offset for partial
+ unsigned int offset = (block_strip * BLOCK_THREADS) + linear_tid;
+
+ // Add in one padding element for every segment
+ if (SEGMENT_PADDING > 0)
+ {
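+ // Illustrative note: e.g., with SEGMENT_LENGTH == 4, a raw offset of 10
+ // becomes 10 + (10 / 4) = 12, skipping one padding cell per completed segment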
+ offset += offset / SEGMENT_LENGTH;
+ }
+
+ // Pointer into the grid, which incorporates one padding partial per shared memory segment
+ return temp_storage + offset;
+ }
+
+
+ /**
+ * \brief Returns the location for the calling thread to begin sequential raking
+ */
+ static __device__ __forceinline__ T* RakingPtr(
+ TempStorage &temp_storage,
+ int linear_tid)
+ {
+ return temp_storage + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
+ }
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/lib/kokkos/TPL/cub/block/block_reduce.cuh b/lib/kokkos/TPL/cub/block/block_reduce.cuh
new file mode 100755
index 0000000000..ffdff73775
--- /dev/null
+++ b/lib/kokkos/TPL/cub/block/block_reduce.cuh
@@ -0,0 +1,563 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA threadblock.
+ */
+enum BlockReduceAlgorithm
+{
+
+ /**
+ * \par Overview
+ * An efficient "raking" reduction algorithm. Execution is comprised of
+ * three phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more
+ * than one input each). Each thread then places the partial reduction
+ * of its item(s) into shared memory.
+ * -# Upsweep sequential reduction in shared memory. Threads within a
+ * single warp rake across segments of shared partial reductions.
+ * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+ *
+ * \par
+ * \image html block_reduce.png
+ * \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer longer turnaround latencies when the
+ * GPU is under-occupied, it can often provide higher overall throughput
+ * across the GPU when suitably occupied.
+ */
+ BLOCK_REDUCE_RAKING,
+
+
+ /**
+ * \par Overview
+ * A quick "tiled warp-reductions" reduction algorithm. Execution is
+ * comprised of three phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more
+ * than one input each). Each thread then places the partial reduction
+ * of its item(s) into shared memory.
+ * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+ * reduction within each warp.
+ * -# A propagation phase where the warp reduction outputs in each warp are
+ * updated with the aggregate from each preceding warp.
+ *
+ * \par
+ * \image html block_scan_warpscans.png
+ * \p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer lower overall throughput across the
+ * GPU due to a heavy reliance on inefficient warp-reductions, it
+ * can often provide lower turnaround latencies when the GPU is
+ * under-occupied.
+ */
+ BLOCK_REDUCE_WARP_REDUCTIONS,
+};
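+
+// Illustrative sketch (hypothetical type and block size): the enumerators above
+// are supplied as the third template argument when specializing BlockReduce.
+//
+// typedef cub::BlockReduce<float, 256, BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
+// __shared__ typename BlockReduceT::TempStorage temp_storage;
+// float block_sum = BlockReduceT(temp_storage).Sum(thread_value);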
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. 
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * A reduction (or fold)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \par
+ * Optionally, BlockReduce can be specialized by algorithm to accommodate different latency/throughput workload profiles:
+ * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
+ * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \tparam T Data type being reduced
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_RAKING)
+ *
+ * \par Performance Considerations
+ * - Very efficient (only one synchronization barrier).
+ * - Zero bank conflicts for most types.
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ * - Summation (vs. generic reduction)
+ * - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ * - Every thread has a valid input (i.e., full vs. partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Compute the block-wide sum for thread0
+ * int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ */
+template <
+ typename T,
+ int BLOCK_THREADS,
+ BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_RAKING>
+class BlockReduce
+{
+private:
+
+ /******************************************************************************
+ * Constants and typedefs
+ ******************************************************************************/
+
+ /// Internal specialization.
+ typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+ BlockReduceWarpReductions<T, BLOCK_THREADS>,
+ BlockReduceRaking<T, BLOCK_THREADS> >::Type InternalBlockReduce;
+
+ /// Shared memory storage layout type for BlockReduce
+ typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Internal storage allocator
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Shared storage reference
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+
+public:
+
+ /// \smemstorage{BlockReduce}
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************//**
+ * \name Collective constructors
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x.
+ */
+ __device__ __forceinline__ BlockReduce()
+ :
+ temp_storage(PrivateStorage()),
+ linear_tid(threadIdx.x)
+ {}
+
+
+ /**
+ * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x.
+ */
+ __device__ __forceinline__ BlockReduce(
+ TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(threadIdx.x)
+ {}
+
+
+ /**
+ * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
+ */
+ __device__ __forceinline__ BlockReduce(
+ int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ :
+ temp_storage(PrivateStorage()),
+ linear_tid(linear_tid)
+ {}
+
+
+ /**
+ * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
+ */
+ __device__ __forceinline__ BlockReduce(
+ TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
+ int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(linear_tid)
+ {}
+
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Generic reductions
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * Supports non-commutative reduction operators.
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a max reduction of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Each thread obtains an input item
+ * int thread_data;
+ * ...
+ *
+ * // Compute the block-wide max for thread0
+ * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+ *
+ * \endcode
+ *
+ * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T input, ///< [in] Calling thread's input
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+ }
+
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * Supports non-commutative reduction operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a max reduction of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Compute the block-wide max for thread0
+ * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+ *
+ * \endcode
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment
+ ReductionOp reduction_op) ///< [in] Binary reduction operator
+ {
+ // Reduce partials
+ T partial = ThreadReduce(inputs, reduction_op);
+ return Reduce(partial, reduction_op);
+ }
+
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * Supports non-commutative reduction operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int num_valid, ...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Each thread obtains an input item
+ * int thread_data;
+ * if (threadIdx.x < num_valid) thread_data = ...
+ *
+ * // Compute the block-wide max for thread0
+ * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+ *
+ * \endcode
+ *
+ * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ReductionOp>
+ __device__ __forceinline__ T Reduce(
+ T input, ///< [in] Calling thread's input
+ ReductionOp reduction_op, ///< [in] Binary reduction operator
+ int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+ {
+ // Determine if we can skip bounds checking
+ if (num_valid >= BLOCK_THREADS)
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, num_valid, reduction_op);
+ }
+ else
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Reduce<false>(input, num_valid, reduction_op);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Summation reductions
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a sum reduction of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Each thread obtains an input item
+ * int thread_data;
+ * ...
+ *
+ * // Compute the block-wide sum for thread0
+ * int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ */
+ __device__ __forceinline__ T Sum(
+ T input) ///< [in] Calling thread's input
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, BLOCK_THREADS);
+ }
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Compute the block-wide sum for thread0
+ * int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+ template <int ITEMS_PER_THREAD>
+ __device__ __forceinline__ T Sum(
+ T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment
+ {
+ // Reduce partials
+ T partial = ThreadReduce(inputs, cub::Sum());
+ return Sum(partial);
+ }
+
+
+ /**
+ * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element.
+ *
+ * The return value is undefined in threads other than thread0.
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int num_valid, ...)
+ * {
+ * // Specialize BlockReduce for 128 threads on type int
+ * typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ * // Allocate shared memory for BlockReduce
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * // Each thread obtains an input item (up to num_items)
+ * int thread_data;
+ * if (threadIdx.x < num_valid)
+ * thread_data = ...
+ *
+ * // Compute the block-wide sum for thread0
+ * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+ *
+ * \endcode
+ *
+ */
+ __device__ __forceinline__ T Sum(
+ T input, ///< [in] Calling thread's input
+ int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+ {
+ // Determine if we can skip bounds checking
+ if (num_valid >= BLOCK_THREADS)
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, num_valid);
+ }
+ else
+ {
+ return InternalBlockReduce(temp_storage, linear_tid).template Sum<false>(input, num_valid);
+ }
+ }
+
+
+ //@} end member group
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/lib/kokkos/TPL/cub/block/block_scan.cuh b/lib/kokkos/TPL/cub/block/block_scan.cuh
new file mode 100755
index 0000000000..1c1a2dac81
--- /dev/null
+++ b/lib/kokkos/TPL/cub/block/block_scan.cuh
@@ -0,0 +1,2233 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_scan_raking.cuh"
+#include "specializations/block_scan_warp_scans.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
+ */
+enum BlockScanAlgorithm
+{
+
+ /**
+ * \par Overview
+ * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
+ * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
+ * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+ * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+ * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+ *
+ * \par
+ * \image html block_scan_raking.png
+ * \p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer longer turnaround latencies when the
+ * GPU is under-occupied, it can often provide higher overall throughput
+ * across the GPU when suitably occupied.
+ */
+ BLOCK_SCAN_RAKING,
+
+
+ /**
+ * \par Overview
+ * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+ * the expense of higher register pressure. Raking threads preserve their
+ * "upsweep" segment of values in registers while performing warp-synchronous
+ * scan, allowing the "downsweep" not to re-read them from shared memory.
+ */
+ BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+ /**
+ * \par Overview
+ * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
+ * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+ * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+ * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+ *
+ * \par
+ * \image html block_scan_warpscans.png
+ * \p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer lower overall throughput across the
+ * GPU due to a heavy reliance on inefficient warpscans, it can
+ * often provide lower turnaround latencies when the GPU is under-occupied.
+ */
+ BLOCK_SCAN_WARP_SCANS,
+};
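+
+// Illustrative sketch (hypothetical type and block size): the enumerators above
+// select the underlying implementation when specializing BlockScan.
+//
+// typedef cub::BlockScan<int, 128, BLOCK_SCAN_RAKING_MEMOIZE> BlockScanT;
+// __shared__ typename BlockScanT::TempStorage temp_storage;
+// BlockScanT(temp_storage).ExclusiveSum(thread_value, thread_value);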
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. 
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output list where each element is computed to be the reduction
+ * of the elements occurring earlier in the input list. Prefix sum
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the ith output reduction incorporates the ith input.
+ * The term \em exclusive indicates the ith input is not incorporated into
+ * the ith output reduction.
+ *
+ * \par
+ * Optionally, BlockScan can be specialized by algorithm to accommodate different latency/throughput workload profiles:
+ * -# cub::BLOCK_SCAN_RAKING. An efficient "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ * -# cub::BLOCK_SCAN_WARP_SCANS. A quick "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \tparam T Data type being scanned
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }.
+ * The corresponding output \p thread_data in those threads will be
+ * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Uses only one or two block-wide synchronization barriers (depending on
+ * algorithm selection)
+ * - Zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ * - Prefix sum variants (vs. generic scan)
+ * - Exclusive variants (vs. inclusive)
+ * - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+ typename T,
+ int BLOCK_THREADS,
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING>
+class BlockScan
+{
+private:
+
+ /******************************************************************************
+ * Constants and typedefs
+ ******************************************************************************/
+
+ /**
+ * Ensure the template parameterization meets the requirements of the
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
+ * cannot be used with threadblock sizes not a multiple of the
+ * architectural warp size.
+ */
+ static const BlockScanAlgorithm SAFE_ALGORITHM =
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % PtxArchProps::WARP_THREADS != 0)) ?
+ BLOCK_SCAN_RAKING :
+ ALGORITHM;
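+
+ // Illustrative note: e.g., a hypothetical 100-thread block requesting
+ // BLOCK_SCAN_WARP_SCANS falls back to BLOCK_SCAN_RAKING on a 32-thread-warp
+ // architecture, since 100 is not a multiple of the warp size.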
+
+ /// Internal specialization.
+ typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+ BlockScanWarpScans<T, BLOCK_THREADS>,
+ BlockScanRaking<T, BLOCK_THREADS, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)> >::Type InternalBlockScan;
+
+
+ /// Shared memory storage layout type for BlockScan
+ typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Shared storage reference
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Internal storage allocator
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+public:
+
+ /// \smemstorage{BlockScan}
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************//**
+ * \name Collective constructors
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x.
+ */
+ __device__ __forceinline__ BlockScan()
+ :
+ temp_storage(PrivateStorage()),
+ linear_tid(threadIdx.x)
+ {}
+
+
+ /**
+ * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x.
+ */
+ __device__ __forceinline__ BlockScan(
+ TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(threadIdx.x)
+ {}
+
+
+ /**
+ * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
+ */
+ __device__ __forceinline__ BlockScan(
+ int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ :
+ temp_storage(PrivateStorage()),
+ linear_tid(linear_tid)
+ {}
+
+
+ /**
+ * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
+ */
+ __device__ __forceinline__ BlockScan(
+ TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
+ int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(linear_tid)
+ {}
+
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Exclusive prefix sum operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The
+ * corresponding output \p thread_data in those threads will be 0, 1, ..., 127.
+ *
+ */
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output) ///< [out] Calling thread's output item (may be aliased to \p input)
+ {
+ T block_aggregate;
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The
+ * corresponding output \p thread_data in those threads will be 0, 1, ..., 127.
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+ *
+ */
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block; however, only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix sum over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockScan for 128 threads
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveSum(
+ * thread_data, thread_data, block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 0, 1, ..., 127.
+ * The output for the second segment will be 128, 129, ..., 255. Furthermore,
+ * the value \p 128 will be stored in \p block_aggregate for all threads after each scan.
+ *
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate, block_prefix_op);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Exclusive prefix sum operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The
+ * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+ template <int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input)
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The
+ * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+ * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+ template <int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block; however, only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix sum over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec4)
+ * across 128 threads where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * __syncthreads();
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage.scan).ExclusiveSum(
+ * thread_data, thread_data, block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * __syncthreads();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511.
+ * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. Furthermore,
+ * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+
+ //@} end member group // Exclusive prefix sums (multiple data per thread)
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T identity, ///< [in] Identity value
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T block_aggregate;
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockScan for 128 threads
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(INT_MIN);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveScan(
+ * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126.
+ * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. Furthermore,
+ * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second
+ * scan, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_op);
+ }
+
+
+ //@} end member group // Exclusive prefix scans
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }.
+ * The corresponding output \p thread_data in those threads will be
+ * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, identity, scan_op);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The
+ * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }.
+ * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int*, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * __syncthreads();
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage.scan).ExclusiveScan(
+ * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * __syncthreads();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510.
+ * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. Furthermore,
+ * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second
+ * scan, etc.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_op);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ //@} end member group
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations (identityless, single datum per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no identity value, the output computed for thread0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
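+ * A minimal usage sketch (assuming a \p cub::Max() scan operator; without an identity
+ * value the output computed for thread0 is undefined):
+ * \par
+ * \code
+ * int thread_data;
+ * ...
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max());
+ * \endcode
+ *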
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T block_aggregate;
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations (identityless, multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no identity value, the output computed for thread0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+ // Exclusive scan in registers with prefix
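+ // (thread0's seed from the identityless block scan is undefined, so the flag keeps
+ // it from folding that seed into its items; per the brief above, thread0's output
+ // is undefined)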
+ ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ //@} end member group
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+ /******************************************************************//**
+ * \name Inclusive prefix sum operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The
+ * corresponding output \p thread_data in those threads will be 1, 2, ..., 128.
+ *
+ */
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output) ///< [out] Calling thread's output item (may be aliased to \p input)
+ {
+ T block_aggregate;
+ InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The
+ * corresponding output \p thread_data in those threads will be 1, 2, ..., 128.
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+ *
+ */
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate);
+ }
+
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix sum over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockScan for 128 threads
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveSum(
+ * thread_data, thread_data, block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, ..., 128.
+ * The output for the second segment will be 129, 130, ..., 256. Furthermore,
+ * the value \p 128 will be stored in \p block_aggregate for all threads after each scan.
+ *
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <typename BlockPrefixOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate, block_prefix_op);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix sum operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The
+ * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+ template <int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input)
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0]);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial);
+
+ // Inclusive scan in registers with prefix
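+ // (thread0's exclusive block prefix is the additive identity, so it simply starts
+ // the inclusive scan from its own first item)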
+ ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The
+ * corresponding output \p thread_data in those threads will be
+ * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }.
+ * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+ template <int ITEMS_PER_THREAD>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_aggregate);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
+ // Inclusive scan in registers with prefix
+ ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix sum over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec4)
+ * across 128 threads where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int*, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * __syncthreads();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * __syncthreads();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. Furthermore,
+ * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_aggregate, block_prefix_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op);
+
+ // Inclusive scan in registers with prefix
+ ThreadScanInclusive(input, output, scan_op, thread_partial);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T block_aggregate;
+ InclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockScan for 128 threads
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(INT_MIN);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveScan(
+ * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126.
+ * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. Furthermore,
+ * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second
+ * scan, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The
+ * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveScan(input[0], output[0], scan_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+ // Inclusive scan in registers with prefix
+ ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }.
+ * The corresponding output \p thread_data in those threads will be
+ * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }.
+ * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
+ // Inclusive scan in registers with prefix
+ ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * lane0 is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int*, 128, 4, cub::BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * __syncthreads();
+ *
+ * // Collectively compute the block-wide inclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage.scan).InclusiveScan(
+ * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * __syncthreads();
+ * }
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510.
+ * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. Furthermore,
+ * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second
+ * scan, etc.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op);
+
+ // Inclusive scan in registers with prefix
+ ThreadScanInclusive(input, output, scan_op, thread_partial);
+ }
+ }
+
+ //@} end member group
+
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/lib/kokkos/TPL/cub/block/block_store.cuh b/lib/kokkos/TPL/cub/block/block_store.cuh
new file mode 100755
index 0000000000..fb990de1c7
--- /dev/null
+++ b/lib/kokkos/TPL/cub/block/block_store.cuh
@@ -0,0 +1,926 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for writing linear segments of data from the CUDA thread block
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_vector.cuh"
+#include "../thread/thread_store.cuh"
+#include "block_exchange.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup IoModule
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked I/O
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier.
+ *
+ * \blocked
+ *
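+ * The code snippet below is a minimal usage sketch (the kernel and the 4-items-per-thread
+ * tile are illustrative assumptions, not part of this header).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_out, ...)
+ * {
+ * // Each thread owns 4 consecutive ints
+ * int thread_data[4];
+ * ...
+ *
+ * // Write the blocked arrangement to d_out using the default cache modifier
+ * cub::StoreBlocked<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data);
+ * }
+ * \endcode
+ *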
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreBlocked(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+ // Store directly in thread-blocked order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ ThreadStore(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
+ }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, guarded by range
+ *
+ * \blocked
+ *
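+ * The code snippet below is a minimal sketch of the guarded variant (the kernel and tile
+ * size are illustrative assumptions); items whose destination offsets fall at or beyond
+ * \p num_valid are simply not written.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_out, int num_valid, ...)
+ * {
+ * int thread_data[4];
+ * ...
+ *
+ * // Guarded store of a partial tile
+ * cub::StoreBlocked<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data, num_valid);
+ * }
+ * \endcode
+ *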
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreBlocked(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int valid_items) ///< [in] Number of valid items to write
+{
+ // Store directly in thread-blocked order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
+ {
+ ThreadStore(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
+ }
+ }
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Striped I/O
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier.
+ *
+ * \striped
+ *
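+ * The code snippet below is a minimal usage sketch (the kernel and the 128-thread block
+ * size are illustrative assumptions). Note that \p BLOCK_THREADS must be supplied
+ * explicitly because it cannot be inferred from the arguments.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_out, ...)
+ * {
+ * int thread_data[4];
+ * ...
+ *
+ * // Item ITEM of each thread goes to d_out[(ITEM * 128) + threadIdx.x]
+ * cub::StoreStriped<cub::STORE_DEFAULT, 128>(threadIdx.x, d_out, thread_data);
+ * }
+ * \endcode
+ *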
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+ // Store directly in striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ ThreadStore(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
+ }
+}
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range
+ *
+ * \striped
+ *
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int valid_items) ///< [in] Number of valid items to write
+{
+ // Store directly in striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
+ {
+ ThreadStore(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
+ }
+ }
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Warp-striped I/O
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
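+ * \par
+ * The code snippet below is a minimal usage sketch (the kernel and the 4-items-per-thread
+ * tile are illustrative assumptions; the block size is presumed to be a warp multiple).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_out, ...)
+ * {
+ * int thread_data[4];
+ * ...
+ *
+ * // Each warp writes its own contiguous segment, striped across the warp's lanes
+ * cub::StoreWarpStriped<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data);
+ * }
+ * \endcode
+ *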
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+ int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+ int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+ int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
+ // Store directly in warp-striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ ThreadStore(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+ }
+}
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int valid_items) ///< [in] Number of valid items to write
+{
+ int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+ int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+ int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
+ // Store directly in warp-striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if (warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS) < valid_items)
+ {
+ ThreadStore(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+ }
+ }
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Blocked, vectorized I/O
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, attempting to use CUDA vector stores for higher memory throughput.
+ *
+ * \blocked
+ *
+ * The output pointer \p block_ptr must be quad-item aligned (allocations returned by
+ * \p cudaMalloc() satisfy this by default).
+ *
+ * \par
+ * The following conditions will prevent vectorization, in which case the store falls back to an element-wise blocked store (equivalent to cub::StoreBlocked()):
+ * - \p ITEMS_PER_THREAD is odd
+ * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
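+ * \par
+ * The code snippet below is a minimal usage sketch (the kernel and tile size are
+ * illustrative assumptions). The destination must be a raw, suitably aligned pointer
+ * rather than an arbitrary output iterator.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_out, ...)
+ * {
+ * // 4 ints per thread can be written as a single int4-sized vector store per thread
+ * int thread_data[4];
+ * ...
+ *
+ * cub::StoreBlockedVectorized<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data);
+ * }
+ * \endcode
+ *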
+ * \tparam MODIFIER cub::PtxStoreModifier cache modifier.
+ * \tparam T [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ *
+ */
+template <
+ PtxStoreModifier MODIFIER,
+ typename T,
+ int ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreBlockedVectorized(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+ T *block_ptr, ///< [in] The thread block's base output pointer for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+ enum
+ {
+ // Maximum CUDA vector size is 4 elements
+ MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+ // Vector size must be a power of two and an even divisor of the items per thread
+ VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+ MAX_VEC_SIZE :
+ 1,
+
+ VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+ };
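+
+ // For example (illustrative instantiations): ITEMS_PER_THREAD == 4 yields VEC_SIZE == 4,
+ // ITEMS_PER_THREAD == 2 yields VEC_SIZE == 2, and ITEMS_PER_THREAD == 6 falls back to
+ // VEC_SIZE == 1 because 6 is not evenly divisible by the maximum vector width of 4.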
+
+ // Vector type
+ typedef typename VectorHelper<T, VEC_SIZE>::Type Vector;
+
+ // Alias global pointer
+ Vector *block_ptr_vectors = reinterpret_cast<Vector*>(block_ptr);
+
+ // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+ Vector raw_vector[VECTORS_PER_THREAD];
+ T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+ // Copy
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ raw_items[ITEM] = items[ITEM];
+ }
+
+ // Direct-store using vector types
+ StoreBlocked