git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12041 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2014-05-29 22:52:05 +00:00
parent 3545d44d0d
commit 3cfef73208
138 changed files with 52720 additions and 0 deletions


@@ -19,6 +19,8 @@ cuda NVIDIA GPU routines, USER-CUDA package
from Christian Trott (U Tech Ilmenau)
gpu general GPU routines, GPU package
from Mike Brown (ORNL)
kokkos Kokkos package for GPU and many-core acceleration
from Kokkos development team (Sandia)
linalg set of BLAS and LAPACK routines needed by USER-ATC package
from Axel Kohlmeyer (Temple U)
poems POEMS rigid-body integration package, POEMS package

lib/kokkos/Makefile.lammps Normal file

@@ -0,0 +1,104 @@
# Settings that the LAMMPS build will import when this package library is used
OMP = yes
CUDA = no
HWLOC = no
AVX = no
MIC = no
LIBRT = no
DEBUG = no
CUDA_PATH = /usr/local/cuda
KOKKOS_PATH = ../../lib/kokkos
kokkos_SYSINC = -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I../
SRC_KOKKOS = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
ifeq ($(CUDA), yes)
kokkos_SYSINC += -x cu -DDEVICE=2 -DKOKKOS_HAVE_CUDA
SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cu)
USRLIB += -L$(CUDA_PATH)/lib64 -lcudart -lcuda
ifeq ($(UVM), yes)
kokkos_SYSINC += -DKOKKOS_USE_UVM
endif
else
kokkos_SYSINC += -DDEVICE=1
endif
ifeq ($(CUSPARSE), yes)
kokkos_SYSINC += -DKOKKOS_USE_CUSPARSE
USRLIB += -lcusparse
endif
ifeq ($(CUBLAS), yes)
kokkos_SYSINC += -DKOKKOS_USE_CUBLAS
USRLIB += -lcublas
endif
ifeq ($(AVX), yes)
ifeq ($(CUDA), yes)
kokkos_SYSINC += -Xcompiler -mavx
else
kokkos_SYSINC += -mavx
endif
LINKFLAGS += -mavx
endif
ifeq ($(MIC), yes)
kokkos_SYSINC += -mmic
LINKFLAGS += -mmic
endif
ifeq ($(OMP),yes)
kokkos_SYSINC += -DKOKKOS_HAVE_OPENMP
SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
ifeq ($(CUDA), yes)
kokkos_SYSINC += -Xcompiler -fopenmp
else
kokkos_SYSINC += -fopenmp
endif
LINKFLAGS += -fopenmp
else
kokkos_SYSINC += -DKOKKOS_HAVE_PTHREAD
USRLIB += -lpthread
SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
endif
ifeq ($(HWLOC),yes)
kokkos_SYSINC += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include
USRLIB += -L$(HWLOCPATH)/lib -lhwloc
endif
ifeq ($(RED_PREC), yes)
kokkos_SYSINC += --use_fast_math
endif
ifeq ($(DEBUG), yes)
kokkos_SYSINC += -g -G -DKOKKOS_EXPRESSION_CHECK -DENABLE_TRACEBACK
LINKFLAGS += -g
endif
ifeq ($(LIBRT),yes)
kokkos_SYSINC += -DKOKKOS_USE_LIBRT -DPREC_TIMER
USRLIB += -lrt
endif
ifeq ($(CUDALDG), yes)
kokkos_SYSINC += -DKOKKOS_USE_LDG_INTRINSIC
endif
OBJ_KOKKOS_TMP = $(SRC_KOKKOS:.cpp=.o)
OBJ_KOKKOS = $(OBJ_KOKKOS_TMP:.cu=.o)
OBJ_KOKKOS_LINK = $(notdir $(OBJ_KOKKOS))
override OBJ += kokkos_depend.o
libkokkoscore.a: $(OBJ_KOKKOS)
ar cr libkokkoscore.a $(OBJ_KOKKOS_LINK)
kokkos_depend.o: libkokkoscore.a
touch kokkos_depend.cpp
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c kokkos_depend.cpp
kokkos_SYSLIB = -L./ $(LINKFLAGS) $(USRLIB)

lib/kokkos/README Normal file

@@ -0,0 +1,45 @@
Kokkos library
Carter Edwards, Christian Trott, others ??? (CT NOTE)
Sandia National Labs
CT NOTE: Version ???
27 May 2014
CT NOTE: Pointer to Kokkos web page???
-------------------------
This directory has source files from the Kokkos library that LAMMPS
uses when building with its KOKKOS package. The package contains
versions of pair, fix, and atom styles written with Kokkos data
structures and calls to the Kokkos library; these styles should run
efficiently on various kinds of accelerated nodes, including GPUs and
many-core chips.
Kokkos is a C++ library that provides two key abstractions for an
application like LAMMPS. First, it allows a single implementation of
an application kernel (e.g. a pair style) to run efficiently on
different kinds of hardware (GPU, Intel Phi, many-core chip).
Second, it adjusts the memory layout of basic data structures like 2d
and 3d arrays specifically for the chosen hardware. These are used in
LAMMPS to store atom coordinates, forces, and neighbor lists. The
layout is chosen to optimize performance on each platform. Again,
this is hidden from the developer and does not affect how the single
implementation of the kernel is coded.
CT NOTE: More details???
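To give a flavor of the first abstraction, here is a minimal sketch
of a kernel written against Kokkos. It is illustrative only: it is
not code shipped in this directory, and the View syntax and the
KOKKOS_LAMBDA macro follow current Kokkos usage, which may differ
from the version bundled here. The same source compiles for
whichever backend (OpenMP, Pthreads, CUDA) Kokkos was configured
with, and the Views pick a memory layout suited to that backend.

  #include <Kokkos_Core.hpp>

  int main(int argc, char* argv[]) {
    Kokkos::initialize(argc, argv);
    {
      const int n = 1000;
      // n atom positions and forces; Kokkos chooses the memory layout
      // appropriate for the configured execution space
      Kokkos::View<double*[3]> x("x", n);
      Kokkos::View<double*[3]> f("f", n);

      // one kernel source, run on host threads or the GPU
      Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
        f(i,0) = 0.0;
        f(i,1) = 0.0;
        f(i,2) = -x(i,2);   // toy "force", for illustration only
      });
    }
    Kokkos::finalize();
    return 0;
  }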
To build LAMMPS with Kokkos, you should not need to make any changes
to files in this directory. You can override the defaults set in
Makefile.lammps when building LAMMPS by defining variables as part of
the make command, e.g. "make <machine> OMP=yes CUDA=no" to enable
OpenMP threading and disable CUDA. Details of the build process with
Kokkos are explained in Section 2.3 of doc/Section_start.html and in
Section 5.9 of doc/Section_accelerate.html.
The one exception is that when using Kokkos with NVIDIA GPUs, the
CUDA_PATH setting in Makefile.lammps needs to point to the
installation of the CUDA software on your machine. The normal default
location is /usr/local/cuda. If this is not correct, you need to edit
Makefile.lammps.


@@ -0,0 +1,587 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
* \ingroup BlockModule
*
* \par Overview
* A set of "head flags" (or "tail flags") is often used to indicate corresponding items
* that differ from their predecessors (or successors). For example, head flags are convenient
* for demarcating disjoint data segments as part of a segmented scan or reduction.
*
* \tparam T The data type to be flagged.
* \tparam BLOCK_THREADS The thread block size in threads.
*
* \par A Simple Example
* \blockcollective{BlockDiscontinuity}
* \par
* The code snippet below illustrates the head flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
* The corresponding output \p head_flags in those threads will be
* <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \par Performance Considerations
* - Zero bank conflicts for most types.
*
*/
template <
typename T,
int BLOCK_THREADS>
class BlockDiscontinuity
{
private:
/******************************************************************************
* Type definitions
******************************************************************************/
/// Shared memory storage layout type (last element from each thread's input)
typedef T _TempStorage[BLOCK_THREADS];
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Specialization for when FlagOp has third index param
template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
struct ApplyOp
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b, idx);
}
};
/// Specialization for when FlagOp does not have a third index param
template <typename FlagOp>
struct ApplyOp<FlagOp, false>
{
// Apply flag operator
static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
{
return flag_op(a, b);
}
};
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
public:
/// \smemstorage{BlockDiscontinuity}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockDiscontinuity()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockDiscontinuity(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockDiscontinuity(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockDiscontinuity(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Head flag operations
*********************************************************************/
//@{
/**
* \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
*
* The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* Furthermore, <tt>head_flags<sub><em>i</em></sub></tt> is always set for
* <tt>input<sub>0</sub></tt> in <em>thread</em><sub>0</sub>.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
* The corresponding output \p head_flags in those threads will be
* <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share last item
temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
__syncthreads();
// Set flag for first item
head_flags[0] = (linear_tid == 0) ?
1 : // First thread
ApplyOp<FlagOp>::Flag(
flag_op,
temp_storage[linear_tid - 1],
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set head_flags for remaining items
#pragma unroll
for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++)
{
head_flags[ITEM] = ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEM - 1],
input[ITEM],
(linear_tid * ITEMS_PER_THREAD) + ITEM);
}
}
/**
* \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
*
* The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
* returns \p true (where <em>previous-item</em> is either the preceding item
* in the same thread or the last item in the previous thread).
* For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
* against \p tile_predecessor_item.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates the head-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Have thread0 obtain the predecessor item for the entire tile
* int tile_predecessor_item;
* if (threadIdx.x == 0) tile_predecessor_item = ...
*
* // Collectively compute head flags for discontinuities in the segment
* int head_flags[4];
* BlockDiscontinuity(temp_storage).FlagHeads(
* head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
* and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be
* <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
{
// Share last item
temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
__syncthreads();
// Set flag for first item
int predecessor = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::Flag(
flag_op,
predecessor,
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set flag for remaining items
#pragma unroll
for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++)
{
head_flags[ITEM] = ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEM - 1],
input[ITEM],
(linear_tid * ITEMS_PER_THREAD) + ITEM);
}
}
//@} end member group
/******************************************************************//**
* \name Tail flag operations
*********************************************************************/
//@{
/**
* \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
*
* The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* Furthermore, <tt>tail_flags<sub>ITEMS_PER_THREAD-1</sub></tt> is always
* set for <em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub>.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute tail flags for discontinuities in the segment
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
* The corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first item
temp_storage[linear_tid] = input[0];
__syncthreads();
// Set flag for last item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
// Set flags for remaining items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++)
{
tail_flags[ITEM] = ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEM],
input[ITEM + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEM);
}
}
/**
* \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
*
* The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
* <tt>input<sub><em>i</em></sub></tt> when
* <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
* returns \p true (where <em>next-item</em> is either the next item
* in the same thread or the first item in the next thread).
* For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
* <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
* against \p tile_successor_item.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates the tail-flagging of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockDiscontinuity for 128 threads on type int
* typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Have thread127 obtain the successor item for the entire tile
* int tile_successor_item;
* if (threadIdx.x == 127) tile_successor_item = ...
*
* // Collectively compute tail flags for discontinuities in the segment
* int tail_flags[4];
* BlockDiscontinuity(temp_storage).FlagTails(
* tail_flags, thread_data, cub::Inequality(), tile_successor_item);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is
* <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
* and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be
* <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type)
* \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
__device__ __forceinline__ void FlagTails(
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_successor_item) ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
{
// Share first item
temp_storage[linear_tid] = input[0];
__syncthreads();
// Set flag for last item
int successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
// Set flags for remaining items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++)
{
tail_flags[ITEM] = ApplyOp<FlagOp>::Flag(
flag_op,
input[ITEM],
input[ITEM + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEM);
}
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@@ -0,0 +1,918 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_arch.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
* \ingroup BlockModule
*
* \par Overview
* It is commonplace for blocks of threads to rearrange data items between
* threads. For example, the global memory subsystem prefers access patterns
* where data items are "striped" across threads (where consecutive threads access consecutive items),
* yet most block-wide operations prefer a "blocked" partitioning of items across threads
* (where consecutive items belong to a single thread).
*
* \par
* BlockExchange supports the following types of data exchanges:
* - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>striped</em>](index.html#sec5sec4) arrangements
* - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>warp-striped</em>](index.html#sec5sec4) arrangements
* - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec4)
* - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec4)
*
* \tparam T The data type to be exchanged.
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread.
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
*
* \par A Simple Example
* \blockcollective{BlockExchange}
* \par
* The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of data striped across threads
* int thread_data[4];
* cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
*
* // Collectively exchange data into a blocked arrangement across threads
* BlockExchange(temp_storage).StripedToBlocked(thread_data);
*
* \endcode
* \par
* Suppose the set of striped input \p thread_data across the block of threads is
* <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
* The corresponding output \p thread_data in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
* \par Performance Considerations
* - Proper device-specific padding ensures zero bank conflicts for most types.
*
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
bool WARP_TIME_SLICING = false>
class BlockExchange
{
private:
/******************************************************************************
* Constants
******************************************************************************/
enum
{
LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS,
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,
TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
// Insert padding if the number of items per thread is a power of two
INSERT_PADDING = ((ITEMS_PER_THREAD & (ITEMS_PER_THREAD - 1)) == 0),
PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Shared memory storage layout type
typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
public:
/// \smemstorage{BlockExchange}
struct TempStorage : Uninitialized<_TempStorage> {};
private:
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
int warp_lane;
int warp_id;
int warp_offset;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/**
* Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. Specialized for no timeslicing.
*/
__device__ __forceinline__ void BlockedToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. Specialized for warp-timeslicing.
*/
__device__ __forceinline__ void BlockedToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
{
const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
__syncthreads();
if (warp_id == SLICE)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
}
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
// Read a strip of items
const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
{
int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_items[ITEM] = temp_storage[item_offset];
}
}
}
}
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = temp_items[ITEM];
}
}
/**
* Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
*/
__device__ __forceinline__ void BlockedToWarpStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
*/
__device__ __forceinline__ void BlockedToWarpStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
Int2Type<true> time_slicing)
{
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
{
__syncthreads();
if (warp_id == SLICE)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
}
}
/**
* Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement. Specialized for no timeslicing.
*/
__device__ __forceinline__ void StripedToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
__syncthreads();
// No timeslicing
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement. Specialized for warp-timeslicing.
*/
__device__ __forceinline__ void StripedToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
Int2Type<true> time_slicing)
{
// Warp time-slicing
T temp_items[ITEMS_PER_THREAD];
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
{
const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
// Write a strip of items
const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
{
int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
}
}
__syncthreads();
if (warp_id == SLICE)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_items[ITEM] = temp_storage[item_offset];
}
}
}
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = temp_items[ITEM];
}
}
/**
* Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. Specialized for no timeslicing
*/
__device__ __forceinline__ void WarpStripedToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. Specialized for warp-timeslicing
*/
__device__ __forceinline__ void WarpStripedToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
Int2Type<true> time_slicing)
{
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
{
__syncthreads();
if (warp_id == SLICE)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane;
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_storage[item_offset] = items[ITEM];
}
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD);
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
items[ITEM] = temp_storage[item_offset];
}
}
}
}
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for no timeslicing.
*/
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM];
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage[item_offset] = items[ITEM];
}
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for warp-timeslicing.
*/
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
{
__syncthreads();
const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM] - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage[item_offset] = items[ITEM];
}
}
__syncthreads();
if (warp_id == SLICE)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM;
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_items[ITEM] = temp_storage[item_offset];
}
}
}
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = temp_items[ITEM];
}
}
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for no timeslicing.
*/
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<false> time_slicing)
{
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM];
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage[item_offset] = items[ITEM];
}
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
items[ITEM] = temp_storage[item_offset];
}
}
/**
* Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing.
*/
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks
Int2Type<true> time_slicing)
{
T temp_items[ITEMS_PER_THREAD];
#pragma unroll
for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
{
const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
int item_offset = ranks[ITEM] - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
temp_storage[item_offset] = items[ITEM];
}
}
__syncthreads();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
// Read a strip of items
const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
{
int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
{
if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
temp_items[ITEM] = temp_storage[item_offset];
}
}
}
}
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = temp_items[ITEM];
}
}
public:
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockExchange()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockExchange(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockExchange(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockExchange(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid),
warp_lane(linear_tid & (WARP_THREADS - 1)),
warp_id(linear_tid >> LOG_WARP_THREADS),
warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
{}
//@} end member group
/******************************************************************//**
* \name Structured exchanges
*********************************************************************/
//@{
/**
* \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
*
* \smemreuse
*
* The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of ordered data into a striped arrangement across block threads
* int thread_data[4];
* cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
*
* // Collectively exchange data into a blocked arrangement across threads
* BlockExchange(temp_storage).StripedToBlocked(thread_data);
*
* \endcode
* \par
* Suppose the set of striped input \p thread_data across the block of threads is
* <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from global memory.
* The corresponding output \p thread_data in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
*/
__device__ __forceinline__ void StripedToBlocked(
T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
{
StripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
}
/**
* \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
*
* \smemreuse
*
* The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively exchange data into a striped arrangement across threads
* BlockExchange(temp_storage).BlockedToStriped(thread_data);
*
* // Store data striped across block threads into an ordered tile
* cub::StoreStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
*
* \endcode
* \par
* Suppose the set of blocked input \p thread_data across the block of threads is
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
* The corresponding output \p thread_data in those threads will be
* <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
* preparation for storing to global memory.
*
*/
__device__ __forceinline__ void BlockedToStriped(
T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
{
BlockedToStriped(items, Int2Type<WARP_TIME_SLICING>());
}
/**
* \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
*
* \smemreuse
*
* The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Load a tile of ordered data into a warp-striped arrangement across warp threads
* int thread_data[4];
* cub::LoadWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
*
* // Collectively exchange data into a blocked arrangement across threads
* BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
*
* \endcode
* \par
* Suppose the set of warp-striped input \p thread_data across the block of threads is
* <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
* after loading from global memory. (The first 128 items are striped across
* the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
* The corresponding output \p thread_data in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
*/
__device__ __forceinline__ void WarpStripedToBlocked(
T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
{
WarpStripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
}
/**
* \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
*
* \smemreuse
*
* The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
* of 512 integer items partitioned across 128 threads where each thread owns 4 items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockExchange for 128 threads owning 4 integer items each
* typedef cub::BlockExchange<int, 128, 4> BlockExchange;
*
* // Allocate shared memory for BlockExchange
* __shared__ typename BlockExchange::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively exchange data into a warp-striped arrangement across threads
* BlockExchange(temp_storage).BlockedToWarpStriped(thread_data);
*
* // Store data striped across warp threads into an ordered tile
* cub::StoreStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
*
* \endcode
* \par
* Suppose the set of blocked input \p thread_data across the block of threads is
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
* The corresponding output \p thread_data in those threads will be
* <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
* in preparation for storing to global memory. (The first 128 items are striped across
* the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
*
*/
__device__ __forceinline__ void BlockedToWarpStriped(
T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
{
BlockedToWarpStriped(items, Int2Type<WARP_TIME_SLICING>());
}
//@} end member group
/******************************************************************//**
* \name Scatter exchanges
*********************************************************************/
//@{
/**
* \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
*
* \smemreuse
*/
__device__ __forceinline__ void ScatterToBlocked(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
/**
* \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
*
* \smemreuse
*/
__device__ __forceinline__ void ScatterToStriped(
T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange
int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks
{
ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@@ -0,0 +1,414 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
*/
#pragma once
#include "specializations/block_histogram_sort.cuh"
#include "specializations/block_histogram_atomic.cuh"
#include "../util_arch.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/**
* \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
*/
enum BlockHistogramAlgorithm
{
/**
* \par Overview
* Sorting followed by differentiation. Execution is comprised of two phases:
* -# Sort the data using efficient radix sort
* -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
*
* \par Performance Considerations
* Delivers consistent throughput regardless of sample bin distribution.
*/
BLOCK_HISTO_SORT,
/**
* \par Overview
* Use atomic addition to update byte counts directly
*
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
 * addition, and may be significantly degraded for non-uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*/
BLOCK_HISTO_ATOMIC,
};
/******************************************************************************
* Block histogram
******************************************************************************/
/**
* \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
* \ingroup BlockModule
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
* counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
*
* \par
* Optionally, BlockHistogram can be specialized to use different algorithms:
* -# <b>cub::BLOCK_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
* -# <b>cub::BLOCK_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
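 *
 * \par
 * As an illustrative sketch (not part of the original documentation), the
 * algorithm is selected through the final template parameter, e.g.:
 * \code
 * // 256-bin histogram over 128 threads with 4 character samples each, using atomics
 * typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> BlockHistogramT;
 * \endcode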
*
* \tparam T The sample type being histogrammed (must be castable to an integer bin identifier)
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam ITEMS_PER_THREAD The number of items per thread
* \tparam BINS The number bins within the histogram
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
*
* \par A Simple Example
* \blockcollective{BlockHistogram}
* \par
* The code snippet below illustrates a 256-bin histogram of 512 integer samples that
* are partitioned across 128 threads where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
* // Obtain input samples per thread
* unsigned char data[4];
* ...
*
* // Compute the block-wide histogram
* BlockHistogram(temp_storage).Histogram(data, smem_histogram);
*
* \endcode
*
* \par Performance and Usage Considerations
* - The histogram output can be constructed in shared or global memory
* - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
*
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int BINS,
BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT>
class BlockHistogram
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
/**
* Ensure the template parameterization meets the requirements of the
* targeted device architecture. BLOCK_HISTO_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used
* regardless.
*/
static const BlockHistogramAlgorithm SAFE_ALGORITHM =
((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
BLOCK_HISTO_SORT :
ALGORITHM;
/// Internal specialization.
typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
BlockHistogramSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>,
BlockHistogramAtomic<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> >::Type InternalBlockHistogram;
/// Shared memory storage layout type for BlockHistogram
typedef typename InternalBlockHistogram::TempStorage _TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
public:
/// \smemstorage{BlockHistogram}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockHistogram()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockHistogram(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockHistogram(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockHistogram(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Histogram operations
*********************************************************************/
//@{
/**
* \brief Initialize the shared histogram counters to zero.
*
     * The code snippet below illustrates the initialization and update of a
* histogram of 512 integer samples that are partitioned across 128 threads
* where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
* // Obtain input samples per thread
* unsigned char thread_samples[4];
* ...
*
* // Initialize the block-wide histogram
* BlockHistogram(temp_storage).InitHistogram(smem_histogram);
*
* // Update the block-wide histogram
* BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
*/
template <typename HistoCounter>
__device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
{
// Initialize histogram bin counts to zeros
int histo_offset = 0;
#pragma unroll
for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
{
histogram[histo_offset + linear_tid] = 0;
}
// Finish up with guarded initialization if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
{
histogram[histo_offset + linear_tid] = 0;
}
}
/**
* \brief Constructs a block-wide histogram in shared/global memory. Each thread contributes an array of input elements.
*
* \smemreuse
*
* The code snippet below illustrates a 256-bin histogram of 512 integer samples that
* are partitioned across 128 threads where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
* // Obtain input samples per thread
* unsigned char thread_samples[4];
* ...
*
* // Compute the block-wide histogram
* BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
__device__ __forceinline__ void Histogram(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
// Initialize histogram bin counts to zeros
InitHistogram(histogram);
// Composite the histogram
InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
}
/**
* \brief Updates an existing block-wide histogram in shared/global memory. Each thread composites an array of input elements.
*
* \smemreuse
*
     * The code snippet below illustrates the initialization and update of a
* histogram of 512 integer samples that are partitioned across 128 threads
* where each thread owns 4 samples.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each
* typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
*
* // Allocate shared memory for BlockHistogram
* __shared__ typename BlockHistogram::TempStorage temp_storage;
*
* // Allocate shared memory for block-wide histogram bin counts
* __shared__ unsigned int smem_histogram[256];
*
* // Obtain input samples per thread
* unsigned char thread_samples[4];
* ...
*
* // Initialize the block-wide histogram
* BlockHistogram(temp_storage).InitHistogram(smem_histogram);
*
* // Update the block-wide histogram
* BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
*
* \endcode
*
* \tparam HistoCounter <b>[inferred]</b> Histogram counter type
*/
template <
typename HistoCounter>
__device__ __forceinline__ void Composite(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

File diff suppressed because it is too large

View File

@ -0,0 +1,479 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
*/
#pragma once
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../thread/thread_reduce.cuh"
#include "../thread/thread_scan.cuh"
#include "../block/block_scan.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
* \ingroup BlockModule
*
* \par Overview
 * BlockRadixRank computes, for each key in a tile of keys partitioned across a
 * CUDA thread block, the key's local rank (i.e., its output position within the
 * tile) with respect to the radix digit currently being considered. These ranks
 * can then be used to scatter the keys into digit-sorted order (e.g., as done by
 * cub::BlockRadixSort).
*
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam RADIX_BITS <b>[optional]</b> The number of radix bits per digit place (default: 5 bits)
* \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
* \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
*
* \par Usage Considerations
* - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
* - Assumes a [<em>blocked arrangement</em>](index.html#sec5sec4) of elements across threads
* - \smemreuse{BlockRadixRank::TempStorage}
*
* \par Performance Considerations
*
* \par Algorithm
 * These parallel radix ranking variants have <em>O</em>(<em>n</em>) work complexity and are implemented in three phases:
 * -# Decode the current radix digit of each key and accumulate per-thread digit counts in shared memory
 * -# Prefix-scan the shared digit counters across the thread block
 * -# Combine each key's thread-local prefix with the block-wide digit prefix to produce its local rank
*
* \par Examples
* \par
* - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
 * \code
 * #include <cub/cub.cuh>
 *
 * template <int BLOCK_THREADS>
 * __global__ void ExampleKernel(...)
 * {
 *     // Specialize BlockRadixRank for BLOCK_THREADS threads and 5-bit radix digits
 *     typedef cub::BlockRadixRank<BLOCK_THREADS, 5> BlockRadixRank;
 *     __shared__ typename BlockRadixRank::TempStorage temp_storage;
 *
 *     // Rank one key per thread on the least-significant digit place
 *     unsigned int thread_keys[1];
 *     int thread_ranks[1];
 *     ...
 *     BlockRadixRank(temp_storage).RankKeys(thread_keys, thread_ranks, 0);
 * \endcode
*/
template <
int BLOCK_THREADS,
int RADIX_BITS,
bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte>
class BlockRadixRank
{
private:
/******************************************************************************
* Type definitions and constants
******************************************************************************/
// Integer type for digit counters (to be packed into words of type PackedCounters)
typedef unsigned short DigitCounter;
// Integer type for packing DigitCounters into columns of shared memory banks
typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
unsigned long long,
unsigned int>::Type PackedCounter;
enum
{
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
BYTES_PER_COUNTER = sizeof(DigitCounter),
LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
// The number of packed counters per thread (plus one for padding)
RAKING_SEGMENT = COUNTER_LANES + 1,
LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS,
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
};
/// BlockScan type
typedef BlockScan<PackedCounter, BLOCK_THREADS, INNER_SCAN_ALGORITHM> BlockScan;
/// Shared memory storage layout type for BlockRadixRank
struct _TempStorage
{
// Storage for scanning local ranks
typename BlockScan::TempStorage block_scan;
union
{
DigitCounter digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO];
PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
};
};
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
/// Copy of raking segment, promoted to registers
PackedCounter cached_segment[RAKING_SEGMENT];
/******************************************************************************
* Templated iteration
******************************************************************************/
// General template iteration
template <int COUNT, int MAX>
struct Iterate
{
/**
* Decode keys. Decodes the radix digit from the current digit place
* and increments the thread's corresponding counter in shared
* memory for that digit.
*
* Saves both (1) the prior value of that counter (the key's
* thread-local exclusive prefix sum for that digit), and (2) the shared
* memory offset of the counter (for later use).
*/
template <typename UnsignedBits, int KEYS_PER_THREAD>
static __device__ __forceinline__ void DecodeKeys(
BlockRadixRank &cta, // BlockRadixRank instance
UnsignedBits (&keys)[KEYS_PER_THREAD], // Key to decode
DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value (out parameter)
DigitCounter* (&digit_counters)[KEYS_PER_THREAD], // Counter smem offset (out parameter)
int current_bit) // The least-significant bit position of the current digit to extract
{
// Add in sub-counter offset
UnsignedBits sub_counter = BFE(keys[COUNT], current_bit + LOG_COUNTER_LANES, LOG_PACKING_RATIO);
// Add in row offset
UnsignedBits row_offset = BFE(keys[COUNT], current_bit, LOG_COUNTER_LANES);
// Pointer to smem digit counter
digit_counters[COUNT] = &cta.temp_storage.digit_counters[row_offset][cta.linear_tid][sub_counter];
// Load thread-exclusive prefix
thread_prefixes[COUNT] = *digit_counters[COUNT];
// Store inclusive prefix
*digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
// Iterate next key
Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit);
}
        // Update ranks: add the block-wide digit prefix to each key's thread-local prefix
template <int KEYS_PER_THREAD>
static __device__ __forceinline__ void UpdateRanks(
int (&ranks)[KEYS_PER_THREAD], // Local ranks (out parameter)
DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value
DigitCounter* (&digit_counters)[KEYS_PER_THREAD]) // Counter smem offset
{
// Add in threadblock exclusive prefix
ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT];
// Iterate next key
Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters);
}
};
// Termination
template <int MAX>
struct Iterate<MAX, MAX>
{
// DecodeKeys
template <typename UnsignedBits, int KEYS_PER_THREAD>
static __device__ __forceinline__ void DecodeKeys(
BlockRadixRank &cta,
UnsignedBits (&keys)[KEYS_PER_THREAD],
DigitCounter (&thread_prefixes)[KEYS_PER_THREAD],
DigitCounter* (&digit_counters)[KEYS_PER_THREAD],
int current_bit) {}
// UpdateRanks
template <int KEYS_PER_THREAD>
static __device__ __forceinline__ void UpdateRanks(
int (&ranks)[KEYS_PER_THREAD],
DigitCounter (&thread_prefixes)[KEYS_PER_THREAD],
DigitCounter *(&digit_counters)[KEYS_PER_THREAD]) {}
};
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal storage allocator
*/
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/**
* Performs upsweep raking reduction, returning the aggregate
*/
__device__ __forceinline__ PackedCounter Upsweep()
{
PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
PackedCounter *raking_ptr;
if (MEMOIZE_OUTER_SCAN)
{
// Copy data into registers
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
cached_segment[i] = smem_raking_ptr[i];
}
raking_ptr = cached_segment;
}
else
{
raking_ptr = smem_raking_ptr;
}
return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
}
/// Performs exclusive downsweep raking scan
__device__ __forceinline__ void ExclusiveDownsweep(
PackedCounter raking_partial)
{
PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
cached_segment :
smem_raking_ptr;
// Exclusive raking downsweep scan
ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
if (MEMOIZE_OUTER_SCAN)
{
// Copy data back to smem
#pragma unroll
for (int i = 0; i < RAKING_SEGMENT; i++)
{
smem_raking_ptr[i] = cached_segment[i];
}
}
}
/**
* Reset shared memory digit counters
*/
__device__ __forceinline__ void ResetCounters()
{
// Reset shared memory digit counters
#pragma unroll
for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++)
{
*((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0;
}
}
/**
* Scan shared memory digit counters.
*/
__device__ __forceinline__ void ScanCounters()
{
// Upsweep scan
PackedCounter raking_partial = Upsweep();
// Compute inclusive sum
PackedCounter inclusive_partial;
PackedCounter packed_aggregate;
BlockScan(temp_storage.block_scan, linear_tid).InclusiveSum(raking_partial, inclusive_partial, packed_aggregate);
// Propagate totals in packed fields
#pragma unroll
for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
{
inclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
}
// Downsweep scan with exclusive partial
PackedCounter exclusive_partial = inclusive_partial - raking_partial;
ExclusiveDownsweep(exclusive_partial);
}
public:
/// \smemstorage{BlockScan}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockRadixRank()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockRadixRank(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockRadixRank(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockRadixRank(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Raking
*********************************************************************/
//@{
/**
* \brief Rank keys.
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile
int current_bit) ///< [in] The least-significant bit position of the current digit to extract
{
DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit
DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem
// Reset shared memory digit counters
ResetCounters();
// Decode keys and update digit counters
Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit);
__syncthreads();
// Scan shared memory counters
ScanCounters();
__syncthreads();
// Extract the local ranks of each key
Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters);
}
/**
     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, the block-wide inclusive prefix count for the corresponding digit is also provided.
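     *
     * \par
     * A minimal usage sketch (illustrative only, assuming a \p BlockRadixRank typedef,
     * \p temp_storage, and \p current_bit as in the overview example above):
     * \code
     * unsigned int thread_keys[4];
     * int thread_ranks[4];
     * int digit_prefix;    // meaningful only for threads 0 .. RADIX_DIGITS-1
     * ...
     * BlockRadixRank(temp_storage).RankKeys(thread_keys, thread_ranks, current_bit, digit_prefix);
     * \endcode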
*/
template <
typename UnsignedBits,
int KEYS_PER_THREAD>
__device__ __forceinline__ void RankKeys(
UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile
int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter)
int current_bit, ///< [in] The least-significant bit position of the current digit to extract
        int &inclusive_digit_prefix)            ///< [out] The inclusive prefix sum for the digit <tt>linear_tid</tt>
{
// Rank keys
RankKeys(keys, ranks, current_bit);
// Get the inclusive and exclusive digit totals corresponding to the calling thread.
if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS))
{
// Obtain ex/inclusive digit counts. (Unfortunately these all reside in the
// first counter column, resulting in unavoidable bank conflicts.)
int counter_lane = (linear_tid & (COUNTER_LANES - 1));
int sub_counter = linear_tid >> (LOG_COUNTER_LANES);
inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter];
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@ -0,0 +1,608 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
*/
#pragma once
#include "../util_namespace.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "block_exchange.cuh"
#include "block_radix_rank.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. ![](sorting_logo.png)
* \ingroup BlockModule
*
* \par Overview
* The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
* items into ascending order. It relies upon a positional representation for
* keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
* characters, etc.) specified from least-significant to most-significant. For a
* given input sequence of keys and a set of rules specifying a total ordering
* of the symbolic alphabet, the radix sorting method produces a lexicographic
* ordering of those keys.
*
* \par
* BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
* <tt>unsigned char</tt>, \p int, \p double, etc. Within each key, the implementation treats fixed-length
* bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting
* method can only be applied to unsigned integral types, BlockRadixSort
* is able to sort signed and floating-point types via simple bit-wise transformations
* that ensure lexicographic key ordering.
*
* \tparam Key Key type
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam ITEMS_PER_THREAD The number of items per thread
* \tparam Value <b>[optional]</b> Value type (default: cub::NullType)
* \tparam RADIX_BITS <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
* \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
* \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
*
* \par A Simple Example
* \blockcollective{BlockRadixSort}
* \par
* The code snippet below illustrates a sort of 512 integer keys that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixSort for 128 threads owning 4 integer items each
* typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
*
* // Allocate shared memory for BlockRadixSort
* __shared__ typename BlockRadixSort::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* ...
*
* // Collectively sort the keys
* BlockRadixSort(temp_storage).Sort(thread_keys);
*
* ...
* \endcode
* \par
* Suppose the set of input \p thread_keys across the block of threads is
* <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
*/
template <
typename Key,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename Value = NullType,
int RADIX_BITS = 4,
bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte>
class BlockRadixSort
{
private:
/******************************************************************************
* Constants and type definitions
******************************************************************************/
// Key traits and unsigned bits type
typedef NumericTraits<Key> KeyTraits;
typedef typename KeyTraits::UnsignedBits UnsignedBits;
/// BlockRadixRank utility type
typedef BlockRadixRank<BLOCK_THREADS, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixRank;
/// BlockExchange utility type for keys
typedef BlockExchange<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeKeys;
/// BlockExchange utility type for values
typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeValues;
/// Shared memory storage layout type
struct _TempStorage
{
union
{
typename BlockRadixRank::TempStorage ranking_storage;
typename BlockExchangeKeys::TempStorage exchange_keys;
typename BlockExchangeValues::TempStorage exchange_values;
};
};
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
public:
/// \smemstorage{BlockScan}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockRadixSort()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockRadixSort(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockRadixSort(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockRadixSort(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Sorting (blocked arrangements)
*********************************************************************/
//@{
/**
* \brief Performs a block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys.
*
* \smemreuse
*
* The code snippet below illustrates a sort of 512 integer keys that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive keys.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixSort for 128 threads owning 4 integer keys each
* typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
*
* // Allocate shared memory for BlockRadixSort
* __shared__ typename BlockRadixSort::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* ...
*
* // Collectively sort the keys
* BlockRadixSort(temp_storage).Sort(thread_keys);
*
* \endcode
* \par
* Suppose the set of input \p thread_keys across the block of threads is
* <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
* The corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*/
__device__ __forceinline__ void Sort(
Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
{
UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
// Twiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
}
// Radix sorting passes
while (true)
{
// Rank the blocked keys
int ranks[ITEMS_PER_THREAD];
BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
begin_bit += RADIX_BITS;
__syncthreads();
// Exchange keys through shared memory in blocked arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
// Quit if done
if (begin_bit >= end_bit) break;
__syncthreads();
}
// Untwiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
}
}
/**
* \brief Performs a block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values.
*
* BlockRadixSort can only accommodate one associated tile of values. To "truck along"
* more than one tile of values, simply perform a key-value sort of the keys paired
* with a temporary value array that enumerates the key indices. The reordered indices
* can then be used as a gather-vector for exchanging other associated tile data through
* shared memory.
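     *
     * \par
     * A minimal sketch of that index-enumeration trick (illustrative only, assuming the
     * \p BlockRadixSort typedef and \p temp_storage from the snippet below):
     * \code
     * int thread_keys[4];
     * int thread_indices[4];
     * ...
     * for (int ITEM = 0; ITEM < 4; ITEM++)
     *     thread_indices[ITEM] = (threadIdx.x * 4) + ITEM;    // original blocked item positions
     * BlockRadixSort(temp_storage).Sort(thread_keys, thread_indices);
     * // thread_indices now serves as a gather-vector for any other associated per-item data
     * \endcode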
*
* \smemreuse
*
* The code snippet below illustrates a sort of 512 integer keys and values that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive pairs.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each
* typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
*
* // Allocate shared memory for BlockRadixSort
* __shared__ typename BlockRadixSort::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* int thread_values[4];
* ...
*
* // Collectively sort the keys and values among block threads
* BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
*
* \endcode
* \par
* Suppose the set of input \p thread_keys across the block of threads is
* <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
*
*/
__device__ __forceinline__ void Sort(
Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort
Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
{
UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
// Twiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
}
// Radix sorting passes
while (true)
{
// Rank the blocked keys
int ranks[ITEMS_PER_THREAD];
BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
begin_bit += RADIX_BITS;
__syncthreads();
// Exchange keys through shared memory in blocked arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
__syncthreads();
// Exchange values through shared memory in blocked arrangement
BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
// Quit if done
if (begin_bit >= end_bit) break;
__syncthreads();
}
// Untwiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
}
}
//@} end member group
/******************************************************************//**
* \name Sorting (blocked arrangement -> striped arrangement)
*********************************************************************/
//@{
/**
* \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4).
*
* \smemreuse
*
* The code snippet below illustrates a sort of 512 integer keys that
* are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive keys. The final partitioning is striped.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixSort for 128 threads owning 4 integer keys each
* typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
*
* // Allocate shared memory for BlockRadixSort
* __shared__ typename BlockRadixSort::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* ...
*
* // Collectively sort the keys
* BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
*
* \endcode
* \par
* Suppose the set of input \p thread_keys across the block of threads is
* <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
*
*/
__device__ __forceinline__ void SortBlockedToStriped(
Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
{
UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
// Twiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
}
// Radix sorting passes
while (true)
{
// Rank the blocked keys
int ranks[ITEMS_PER_THREAD];
BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
begin_bit += RADIX_BITS;
__syncthreads();
// Check if this is the last pass
if (begin_bit >= end_bit)
{
// Last pass exchanges keys through shared memory in striped arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
// Quit
break;
}
// Exchange keys through shared memory in blocked arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
__syncthreads();
}
// Untwiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
}
}
/**
* \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4).
*
* BlockRadixSort can only accommodate one associated tile of values. To "truck along"
* more than one tile of values, simply perform a key-value sort of the keys paired
* with a temporary value array that enumerates the key indices. The reordered indices
* can then be used as a gather-vector for exchanging other associated tile data through
* shared memory.
*
* \smemreuse
*
* The code snippet below illustrates a sort of 512 integer keys and values that
* are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive pairs. The final partitioning is striped.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each
* typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
*
* // Allocate shared memory for BlockRadixSort
* __shared__ typename BlockRadixSort::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_keys[4];
* int thread_values[4];
* ...
*
* // Collectively sort the keys and values among block threads
* BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
*
* \endcode
* \par
* Suppose the set of input \p thread_keys across the block of threads is
* <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The
* corresponding output \p thread_keys in those threads will be
* <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
*
*/
__device__ __forceinline__ void SortBlockedToStriped(
Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort
Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
{
UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
// Twiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
}
// Radix sorting passes
while (true)
{
// Rank the blocked keys
int ranks[ITEMS_PER_THREAD];
BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit);
begin_bit += RADIX_BITS;
__syncthreads();
// Check if this is the last pass
if (begin_bit >= end_bit)
{
// Last pass exchanges keys through shared memory in striped arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks);
__syncthreads();
// Last pass exchanges through shared memory in striped arrangement
BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks);
// Quit
break;
}
// Exchange keys through shared memory in blocked arrangement
BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks);
__syncthreads();
// Exchange values through shared memory in blocked arrangement
BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks);
__syncthreads();
}
// Untwiddle bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
}
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@ -0,0 +1,145 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
*/
#pragma once
#include "../util_macro.cuh"
#include "../util_arch.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for raking across thread block data. ![](raking.png)
* \ingroup BlockModule
*
* \par Overview
* This type facilitates a shared memory usage pattern where a block of CUDA
* threads places elements into shared memory and then reduces the active
* parallelism to one "raking" warp of threads for serially aggregating consecutive
* sequences of shared items. Padding is inserted to eliminate bank conflicts
* (for most data types).
*
* \tparam T The data type to be exchanged.
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam BLOCK_STRIPS When strip-mining, the number of threadblock-strips per tile
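 *
 * \par
 * A minimal usage sketch (illustrative only, not part of the original documentation;
 * \p partial stands for a per-thread value computed elsewhere):
 * \code
 * typedef cub::BlockRakingLayout<int, 128> BlockRakingLayout;
 * __shared__ typename BlockRakingLayout::TempStorage temp_storage;
 *
 * // Every thread deposits its partial into the conflict-free grid
 * *BlockRakingLayout::PlacementPtr(temp_storage, threadIdx.x) = partial;
 * __syncthreads();
 *
 * // One warp of raking threads then serially reduces consecutive (padded) segments
 * if (threadIdx.x < BlockRakingLayout::RAKING_THREADS)
 * {
 *     int *raking_segment = BlockRakingLayout::RakingPtr(temp_storage, threadIdx.x);
 *     ...  // reduce BlockRakingLayout::SEGMENT_LENGTH elements starting at raking_segment
 * }
 * \endcode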
*/
template <
typename T,
int BLOCK_THREADS,
int BLOCK_STRIPS = 1>
struct BlockRakingLayout
{
//---------------------------------------------------------------------
// Constants and typedefs
//---------------------------------------------------------------------
enum
{
/// The total number of elements that need to be cooperatively reduced
SHARED_ELEMENTS =
BLOCK_THREADS * BLOCK_STRIPS,
/// Maximum number of warp-synchronous raking threads
MAX_RAKING_THREADS =
CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS),
/// Number of raking elements per warp-synchronous raking thread (rounded up)
SEGMENT_LENGTH =
(SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
/// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
RAKING_THREADS =
(SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
/// Pad each segment length with one element if it evenly divides the number of banks
SEGMENT_PADDING =
(PtxArchProps::SMEM_BANKS % SEGMENT_LENGTH == 0) ? 1 : 0,
/// Total number of elements in the raking grid
GRID_ELEMENTS =
RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
        /// Whether or not bounds checking can be skipped during raking (i.e., the number of reduction elements divides evenly among the raking threads)
UNGUARDED =
(SHARED_ELEMENTS % RAKING_THREADS == 0),
};
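    // Illustrative worked example (not part of the original source): for
    // BLOCK_THREADS = 128 and BLOCK_STRIPS = 1 on a 32-thread-warp / 32-bank device:
    //   SHARED_ELEMENTS = 128, MAX_RAKING_THREADS = 32, SEGMENT_LENGTH = 4,
    //   RAKING_THREADS = 32, SEGMENT_PADDING = 1 (because 32 % 4 == 0),
    //   GRID_ELEMENTS = 32 * (4 + 1) = 160, and UNGUARDED = true (128 % 32 == 0)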
/**
* \brief Shared memory storage type
*/
typedef T TempStorage[BlockRakingLayout::GRID_ELEMENTS];
/**
* \brief Returns the location for the calling thread to place data into the grid
*/
static __device__ __forceinline__ T* PlacementPtr(
TempStorage &temp_storage,
int linear_tid,
int block_strip = 0)
{
// Offset for partial
unsigned int offset = (block_strip * BLOCK_THREADS) + linear_tid;
// Add in one padding element for every segment
if (SEGMENT_PADDING > 0)
{
offset += offset / SEGMENT_LENGTH;
}
// Incorporating a block of padding partials every shared memory segment
return temp_storage + offset;
}
/**
* \brief Returns the location for the calling thread to begin sequential raking
*/
static __device__ __forceinline__ T* RakingPtr(
TempStorage &temp_storage,
int linear_tid)
{
return temp_storage + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@ -0,0 +1,563 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
*/
#pragma once
#include "specializations/block_reduce_raking.cuh"
#include "specializations/block_reduce_warp_reductions.cuh"
#include "../util_type.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/**
* BlockReduceAlgorithm enumerates alternative algorithms for parallel
* reduction across a CUDA threadblock.
*/
enum BlockReduceAlgorithm
{
/**
* \par Overview
* An efficient "raking" reduction algorithm. Execution is comprised of
* three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a
* single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
* \image html block_reduce.png
* <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer longer turnaround latencies when the
* GPU is under-occupied, it can often provide higher overall throughput
* across the GPU when suitably occupied.
*/
BLOCK_REDUCE_RAKING,
/**
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm. Execution is
* comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer lower overall throughput across the
* GPU due to a heavy reliance on inefficient warp-reductions, it
* can often provide lower turnaround latencies when the GPU is
* under-occupied.
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};
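/*
 * Illustrative sketch: the variant is chosen through BlockReduce's third template
 * parameter (see the class below). Thread count and data type here are arbitrary
 * example values.
 *
 *     // Throughput-oriented default
 *     typedef cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_RAKING> RakingReduceT;
 *
 *     // Latency-oriented alternative for under-occupied kernels
 *     typedef cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS> WarpReduceT;
 */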
/******************************************************************************
* Block reduce
******************************************************************************/
/**
* \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
* \ingroup BlockModule
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
*
* \par
* Optionally, BlockReduce can be specialized by algorithm to accommodate different latency/throughput workload profiles:
* -# <b>cub::BLOCK_REDUCE_RAKING</b>. An efficient "raking" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
* -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>. A quick "tiled warp-reductions" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm)
*
* \tparam T Data type being reduced
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam ALGORITHM <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_RAKING)
*
* \par Performance Considerations
* - Very efficient (only one synchronization barrier).
* - Zero bank conflicts for most types.
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Summation (<b><em>vs.</em></b> generic reduction)
* - \p BLOCK_THREADS is a multiple of the architecture's warp size
* - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
* - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
*
* \par A Simple Example
* \blockcollective{BlockReduce}
* \par
* The code snippet below illustrates a sum reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
*/
template <
typename T,
int BLOCK_THREADS,
BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_RAKING>
class BlockReduce
{
private:
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Internal specialization.
typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
BlockReduceWarpReductions<T, BLOCK_THREADS>,
BlockReduceRaking<T, BLOCK_THREADS> >::Type InternalBlockReduce;
/// Shared memory storage layout type for BlockReduce
typedef typename InternalBlockReduce::TempStorage _TempStorage;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
public:
/// \smemstorage{BlockReduce}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockReduce()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockReduce(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockReduce(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockReduce(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Generic reductions
*********************************************************************/
//@{
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes one input element.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* Supports non-commutative reduction operators.
*
* \smemreuse
*
* The code snippet below illustrates a max reduction of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item
* int thread_data;
* ...
*
* // Compute the block-wide max for thread0
* int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
*
* \endcode
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
}
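/*
 * Illustrative sketch: any functor with a member
 * T operator()(const T &a, const T &b) can be supplied as the reduction operator.
 * For example (hypothetical user-defined functor):
 *
 *     struct MinOp
 *     {
 *         __device__ __forceinline__ int operator()(const int &a, const int &b) const
 *         {
 *             return (b < a) ? b : a;
 *         }
 *     };
 *     ...
 *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, MinOp());
 */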
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes an array of consecutive input elements.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* Supports non-commutative reduction operators.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates a max reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide max for thread0
* int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
*
* \endcode
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int ITEMS_PER_THREAD,
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
// Reduce partials
T partial = ThreadReduce(inputs, reduction_op);
return Reduce(partial, reduction_op);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. The first \p num_valid threads each contribute one input element.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* Supports non-commutative reduction operators.
*
* \blocked
*
* \smemreuse
*
* The code snippet below illustrates a max reduction of a partially-full tile of integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int num_valid, ...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item
* int thread_data;
* if (threadIdx.x < num_valid) thread_data = ...
*
* // Compute the block-wide max for thread0
* int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
*
* \endcode
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op, ///< [in] Binary reduction operator
int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
{
// Determine if we can skip bounds checking
if (num_valid >= BLOCK_THREADS)
{
return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, num_valid, reduction_op);
}
else
{
return InternalBlockReduce(temp_storage, linear_tid).template Reduce<false>(input, num_valid, reduction_op);
}
}
//@} end member group
/******************************************************************//**
* \name Summation reductions
*********************************************************************/
//@{
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* \smemreuse
*
* The code snippet below illustrates a sum reduction of 128 integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item
* int thread_data;
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
*/
__device__ __forceinline__ T Sum(
T input) ///< [in] Calling thread's input
{
return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, BLOCK_THREADS);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* \smemreuse
*
* The code snippet below illustrates a sum reduction of 512 integer items that
* are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads
* where each thread owns 4 consecutive items.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data);
*
* \endcode
*
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
*/
template <int ITEMS_PER_THREAD>
__device__ __forceinline__ T Sum(
T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment
{
// Reduce partials
T partial = ThreadReduce(inputs, cub::Sum());
return Sum(partial);
}
/**
* \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element.
*
* The return value is undefined in threads other than thread<sub>0</sub>.
*
* \smemreuse
*
* The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
* are partitioned across 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int num_valid, ...)
* {
* // Specialize BlockReduce for 128 threads on type int
* typedef cub::BlockReduce<int, 128> BlockReduce;
*
* // Allocate shared memory for BlockReduce
* __shared__ typename BlockReduce::TempStorage temp_storage;
*
* // Each thread obtains an input item (up to num_valid)
* int thread_data;
* if (threadIdx.x < num_valid)
* thread_data = ...
*
* // Compute the block-wide sum for thread0
* int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
*
* \endcode
*
*/
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input
int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
{
// Determine if we can skip bounds checking
if (num_valid >= BLOCK_THREADS)
{
return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, num_valid);
}
else
{
return InternalBlockReduce(temp_storage, linear_tid).template Sum<false>(input, num_valid);
}
}
//@} end member group
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
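// Illustrative end-to-end sketch (hypothetical kernel and buffer names): a grid-wide
// sum assembled from per-block BlockReduce::Sum() results. Only thread0 of each block
// holds a defined block aggregate, so only thread0 folds it into the global total;
// d_total is assumed to be zero-initialized.
#include <cub/cub.cuh>

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void GridSumSketch(const int *d_in, int *d_total, int num_items)
{
    typedef cub::BlockReduce<int, BLOCK_THREADS> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // Load a blocked segment, padding out-of-range slots with the identity (0)
    int thread_data[ITEMS_PER_THREAD];
    int block_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        int idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + i;
        thread_data[i] = (idx < num_items) ? d_in[idx] : 0;
    }

    // Block-wide sum (one synchronization barrier); defined only in thread0
    int block_aggregate = BlockReduceT(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        atomicAdd(d_total, block_aggregate);
}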

File diff suppressed because it is too large

View File

@ -0,0 +1,926 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Operations for writing linear segments of data from the CUDA thread block
*/
#pragma once
#include <iterator>
#include "../util_namespace.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_vector.cuh"
#include "../thread/thread_store.cuh"
#include "block_exchange.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup IoModule
* @{
*/
/******************************************************************//**
* \name Blocked I/O
*********************************************************************/
//@{
/**
* \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier.
*
* \blocked
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreBlocked(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
// Store directly in thread-blocked order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
}
}
/**
* \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, guarded by range
*
* \blocked
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreBlocked(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
// Store directly in thread-blocked order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
{
ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
}
}
}
//@} end member group
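/*
 * Illustrative sketch of a guarded blocked store (thread count, item count, and the
 * kernel itself are example choices only):
 *
 *     #include <cub/cub.cuh>
 *
 *     __global__ void ExampleKernel(int *d_out, int valid_items)
 *     {
 *         // Each thread owns 4 consecutive items
 *         int thread_data[4];
 *         ...
 *
 *         // Write them back, skipping any slot at or beyond valid_items
 *         cub::StoreBlocked<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data, valid_items);
 *     }
 */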
/******************************************************************//**
* \name Striped I/O
*********************************************************************/
//@{
/**
* \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier.
*
* \striped
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreStriped(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
// Store directly in striped order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
}
}
/**
* \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range
*
* \striped
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam BLOCK_THREADS The thread block size in threads
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
int BLOCK_THREADS,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreStriped(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
// Store directly in striped order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
{
ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]);
}
}
}
//@} end member group
/******************************************************************//**
* \name Warp-striped I/O
*********************************************************************/
//@{
/**
* \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier.
*
* \warpstriped
*
* \par Usage Considerations
* The number of threads in the thread block must be a multiple of the architecture's warp size.
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreWarpStriped(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
// Store directly in warp-striped order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
}
}
/**
* \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range
*
* \warpstriped
*
* \par Usage Considerations
* The number of threads in the thread block must be a multiple of the architecture's warp size.
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void StoreWarpStriped(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1);
int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
// Store directly in warp-striped order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS) < valid_items)
{
ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
}
}
}
//@} end member group
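/*
 * For reference, the output index written by thread linear_tid for its ITEM-th value
 * under each of the arrangements above:
 *
 *     blocked:      (linear_tid * ITEMS_PER_THREAD) + ITEM
 *     striped:      (ITEM * BLOCK_THREADS) + linear_tid
 *     warp-striped: (warp_id * WARP_THREADS * ITEMS_PER_THREAD) + lane_id + (ITEM * WARP_THREADS)
 */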
/******************************************************************//**
* \name Blocked, vectorized I/O
*********************************************************************/
//@{
/**
* \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier.
*
* \blocked
*
* The output pointer \p block_ptr must be quad-item aligned, which is the default
* alignment of allocations returned by \p cudaMalloc()
*
* \par
* The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
* - \p ITEMS_PER_THREAD is odd
* - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
*
* \tparam MODIFIER cub::PtxStoreModifier cache modifier.
* \tparam T <b>[inferred]</b> The data type to store.
* \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
*
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreBlockedVectorized(
int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
T *block_ptr, ///< [in] The thread block's base output pointer for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
enum
{
// Maximum CUDA vector size is 4 elements
MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
// Vector size must be a power of two and an even divisor of the items per thread
VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
MAX_VEC_SIZE :
1,
VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
};
// Vector type
typedef typename VectorHelper<T, VEC_SIZE>::Type Vector;
// Alias global pointer
Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr);
// Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
Vector raw_vector[VECTORS_PER_THREAD];
T *raw_items = reinterpret_cast<T*>(raw_vector);
// Copy
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
raw_items[ITEM] = items[ITEM];
}
// Direct-store using vector types
StoreBlocked<MODIFIER>(linear_tid, block_ptr_vectors, raw_vector);
}
//@} end member group
/** @} */ // end group IoModule
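/*
 * Illustrative sketch of the vectorized blocked store (example sizes only): with
 * T = int and 4 items per thread the call below can use vector stores; with an odd
 * ITEMS_PER_THREAD or a non-primitive T it quietly degrades to element-wise stores.
 *
 *     #include <cub/cub.cuh>
 *
 *     __global__ void ExampleKernel(int *d_out)
 *     {
 *         int thread_data[4];
 *         ...
 *
 *         // d_out from cudaMalloc() satisfies the quad-item alignment requirement
 *         cub::StoreBlockedVectorized<cub::STORE_DEFAULT>(threadIdx.x, d_out, thread_data);
 *     }
 */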
//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------
/**
* \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
*/
enum BlockStoreAlgorithm
{
/**
* \par Overview
*
* A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written
* directly to memory. The thread block writes items in a parallel "raking" fashion:
* thread<sub><em>i</em></sub> writes the <em>i</em><sup>th</sup> segment of consecutive elements.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
BLOCK_STORE_DIRECT,
/**
* \par Overview
*
* A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written directly
* to memory using CUDA's built-in vectorized stores as a coalescing optimization.
* The thread block writes items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector stores to
* write the <em>i</em><sup>th</sup> segment of consecutive elements.
*
* For example, <tt>st.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high until the
* access stride between threads (i.e., the number of items per thread) exceeds the
* maximum vector store width (typically 4 items or 64B, whichever is lower).
* - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
* - \p ITEMS_PER_THREAD is odd
* - The \p OutputIteratorRA is not a simple pointer type
* - The block output offset is not quadword-aligned
* - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
*/
BLOCK_STORE_VECTORIZE,
/**
* \par Overview
* A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
* transposed into a [<em>striped arrangement</em>](index.html#sec5sec4)
* which is then written to memory. More specifically, cub::BlockExchange
* is used to locally reorder the items into a
* [<em>striped arrangement</em>](index.html#sec5sec4), after which the
* thread block writes items in a parallel "strip-mining" fashion: consecutive
* items owned by thread<sub><em>i</em></sub> are written to memory with
* stride \p BLOCK_THREADS between them.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high regardless
* of items written per thread.
* - The local reordering incurs slightly longer latencies and lower throughput than the
* direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
*/
BLOCK_STORE_TRANSPOSE,
/**
* \par Overview
* A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
* transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4)
* which is then written to memory. More specifically, cub::BlockExchange is used
* to locally reorder the items into a
* [<em>warp-striped arrangement</em>](index.html#sec5sec4), after which
* each warp writes its own contiguous segment in a parallel "strip-mining" fashion:
* consecutive items owned by lane<sub><em>i</em></sub> are written to memory
* with stride \p WARP_THREADS between them.
*
* \par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high regardless
* of items written per thread.
* - The local reordering incurs slightly longer latencies and lower throughput than the
* direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
*/
BLOCK_STORE_WARP_TRANSPOSE,
};
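/*
 * Illustrative sketch: the strategy is selected through BlockStore's ALGORITHM
 * template parameter (see the class below). Iterator type, thread count, and items
 * per thread are arbitrary example values.
 *
 *     // Simplest: direct blocked writes
 *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_DIRECT> DirectStoreT;
 *
 *     // Coalescing-friendly: transpose to a warp-striped arrangement before writing
 *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> WarpStoreT;
 */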
/**
* \addtogroup BlockModule
* @{
*/
/**
* \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec4) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png)
*
* \par Overview
* The BlockStore class provides a single data movement abstraction that can be specialized
* to implement different cub::BlockStoreAlgorithm strategies. This facilitates different
* performance policies for different architectures, data types, granularity sizes, etc.
*
* \par Optionally, BlockStore can be specialized by different data movement strategies:
* -# <b>cub::BLOCK_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written
* directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4)
* of data is written directly to memory using CUDA's built-in vectorized stores as a
* coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_TRANSPOSE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4)
* is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec4) which is
* then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
* -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4)
* is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4) which is
* then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
*
* \tparam OutputIteratorRA The output iterator type (may be a simple pointer type).
* \tparam BLOCK_THREADS The thread block size in threads.
* \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread.
* \tparam ALGORITHM <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT.
* \tparam MODIFIER <b>[optional]</b> cub::PtxStoreModifier cache modifier. default: cub::STORE_DEFAULT.
* \tparam WARP_TIME_SLICING <b>[optional]</b> For transposition-based cub::BlockStoreAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false)
*
* \par A Simple Example
* \blockcollective{BlockStore}
* \par
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 512 integers across 128 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
* meaning items are locally reordered among threads so that memory references will be
* efficiently coalesced using a warp-striped access pattern.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockStore for 128 threads owning 4 integer items each
* typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
*
* // Allocate shared memory for BlockStore
* __shared__ typename BlockStore::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* BlockStore(temp_storage).Store(d_data, thread_data);
*
* \endcode
* \par
* Suppose the set of \p thread_data across the block of threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
* The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*
*/
template <
typename OutputIteratorRA,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
PtxStoreModifier MODIFIER = STORE_DEFAULT,
bool WARP_TIME_SLICING = false>
class BlockStore
{
private:
/******************************************************************************
* Constants and typed definitions
******************************************************************************/
// Data type of output iterator
typedef typename std::iterator_traits<OutputIteratorRA>::value_type T;
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/// Store helper
template <BlockStoreAlgorithm _POLICY, int DUMMY = 0>
struct StoreInternal;
/**
* BLOCK_STORE_DIRECT specialization of store helper
*/
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
{
/// Shared memory storage layout type
typedef NullType TempStorage;
/// Linear thread-id
int linear_tid;
/// Constructor
__device__ __forceinline__ StoreInternal(
TempStorage &temp_storage,
int linear_tid)
:
linear_tid(linear_tid)
{}
/// Store items into a linear segment of memory
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
}
/// Store items into a linear segment of memory, guarded by range
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
}
};
/**
* BLOCK_STORE_VECTORIZE specialization of store helper
*/
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
{
/// Shared memory storage layout type
typedef NullType TempStorage;
/// Linear thread-id
int linear_tid;
/// Constructor
__device__ __forceinline__ StoreInternal(
TempStorage &temp_storage,
int linear_tid)
:
linear_tid(linear_tid)
{}
/// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
__device__ __forceinline__ void Store(
T *block_ptr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items);
}
/// Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
template <typename _OutputIteratorRA>
__device__ __forceinline__ void Store(
_OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreBlocked<MODIFIER>(linear_tid, block_itr, items);
}
/// Store items into a linear segment of memory, guarded by range
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items);
}
};
/**
* BLOCK_STORE_TRANSPOSE specialization of store helper
*/
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
{
// BlockExchange utility type for keys
typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
/// Shared memory storage layout type
typedef typename BlockExchange::TempStorage _TempStorage;
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/// Thread reference to shared storage
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
/// Constructor
__device__ __forceinline__ StoreInternal(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
/// Store items into a linear segment of memory
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockExchange(temp_storage).BlockedToStriped(items);
StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items);
}
/// Store items into a linear segment of memory, guarded by range
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
BlockExchange(temp_storage).BlockedToStriped(items);
StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}
};
/**
* BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
*/
template <int DUMMY>
struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
{
enum
{
WARP_THREADS = PtxArchProps::WARP_THREADS
};
// Assert BLOCK_THREADS must be a multiple of WARP_THREADS
CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
// BlockExchange utility type for keys
typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange;
/// Shared memory storage layout type
typedef typename BlockExchange::TempStorage _TempStorage;
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/// Thread reference to shared storage
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
/// Constructor
__device__ __forceinline__ StoreInternal(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
/// Store items into a linear segment of memory
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockExchange(temp_storage).BlockedToWarpStriped(items);
StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items);
}
/// Store items into a linear segment of memory, guarded by range
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
BlockExchange(temp_storage).BlockedToWarpStriped(items);
StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items);
}
};
/******************************************************************************
* Type definitions
******************************************************************************/
/// Internal store implementation to use
typedef StoreInternal<ALGORITHM> InternalStore;
/// Shared memory storage layout type
typedef typename InternalStore::TempStorage _TempStorage;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/******************************************************************************
* Thread fields
******************************************************************************/
/// Thread reference to shared storage
_TempStorage &temp_storage;
/// Linear thread-id
int linear_tid;
public:
/// \smemstorage{BlockStore}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockStore()
:
temp_storage(PrivateStorage()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ BlockStore(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
linear_tid(threadIdx.x)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier
*/
__device__ __forceinline__ BlockStore(
int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
:
temp_storage(PrivateStorage()),
linear_tid(linear_tid)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier.
*/
__device__ __forceinline__ BlockStore(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
//@} end member group
/******************************************************************//**
* \name Data movement
*********************************************************************/
//@{
/**
* \brief Store items into a linear segment of memory.
*
* \blocked
*
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 512 integers across 128 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
* meaning items are locally reordered among threads so that memory references will be
* efficiently coalesced using a warp-striped access pattern.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* // Specialize BlockStore for 128 threads owning 4 integer items each
* typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
*
* // Allocate shared memory for BlockStore
* __shared__ typename BlockStore::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* BlockStore(temp_storage).Store(d_data, thread_data);
*
* \endcode
* \par
* Suppose the set of \p thread_data across the block of threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
* The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*
*/
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
InternalStore(temp_storage, linear_tid).Store(block_itr, items);
}
/**
* \brief Store items into a linear segment of memory, guarded by range.
*
* \blocked
*
* The code snippet below illustrates the guarded storing of a "blocked" arrangement
* of 512 integers across 128 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
* meaning items are locally reordered among threads so that memory references will be
* efficiently coalesced using a warp-striped access pattern.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items, ...)
* {
* // Specialize BlockStore for 128 threads owning 4 integer items each
* typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
*
* // Allocate shared memory for BlockStore
* __shared__ typename BlockStore::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
*
* \endcode
* \par
* Suppose the set of \p thread_data across the block of threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
* The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
* only the first two threads being unmasked to store portions of valid data.
*
*/
__device__ __forceinline__ void Store(
OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to
T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
int valid_items) ///< [in] Number of valid items to write
{
InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
}
};
/** @} */ // end group BlockModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
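// Illustrative host/device sketch (hypothetical names): launching a kernel that stores
// one 512-item tile per block with the warp-transposing BlockStore documented above.
// 128 threads x 4 items per thread are arbitrary example choices.
#include <cub/cub.cuh>

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void StoreTileSketch(int *d_data)
{
    typedef cub::BlockStore<int*, BLOCK_THREADS, ITEMS_PER_THREAD,
                            cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
    __shared__ typename BlockStoreT::TempStorage temp_storage;

    // Produce a blocked arrangement: thread i owns ITEMS_PER_THREAD consecutive values
    int thread_data[ITEMS_PER_THREAD];
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        thread_data[i] = (threadIdx.x * ITEMS_PER_THREAD) + i;

    // Cooperative, coalesced store of the whole tile
    int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
    BlockStoreT(temp_storage).Store(d_data + tile_offset, thread_data);
}

// Host side (error checking omitted):
//     int *d_data;
//     cudaMalloc(&d_data, num_tiles * 512 * sizeof(int));
//     StoreTileSketch<128, 4><<<num_tiles, 128>>>(d_data);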

View File

@ -0,0 +1,85 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
*/
#pragma once
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int BINS>
struct BlockHistogramAtomic
{
/// Shared memory storage layout type
struct TempStorage {};
/// Constructor
__device__ __forceinline__ BlockHistogramAtomic(
TempStorage &temp_storage,
int linear_tid)
{}
/// Composite data onto an existing histogram
template <
typename HistoCounter>
__device__ __forceinline__ void Composite(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
// Update histogram
#pragma unroll
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
atomicAdd(histogram + items[i], 1);
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
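// Illustrative usage sketch (hypothetical kernel, not taken from the CUB sources):
// compositing per-thread samples into a shared-memory histogram with the atomic
// specialization, then folding it into a zero-initialized global histogram.
// Launching with <128, 4, 256> covers every unsigned char sample value.
#include <cub/cub.cuh>

template <int BLOCK_THREADS, int ITEMS_PER_THREAD, int BINS>
__global__ void AtomicHistogramSketch(const unsigned char *d_samples, unsigned int *d_histogram)
{
    typedef cub::BlockHistogramAtomic<unsigned char, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> BlockHistogramT;
    __shared__ typename BlockHistogramT::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[BINS];

    // Zero the shared histogram
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histogram[bin] = 0;
    __syncthreads();

    // Each thread loads its samples (assumed to lie in [0, BINS))
    unsigned char samples[ITEMS_PER_THREAD];
    int offset = (blockIdx.x * BLOCK_THREADS + threadIdx.x) * ITEMS_PER_THREAD;
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        samples[i] = d_samples[offset + i];

    // Composite into the shared histogram using atomicAdd
    BlockHistogramT(temp_storage, threadIdx.x).Composite(samples, smem_histogram);
    __syncthreads();

    // Fold the block's histogram into the global one
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        atomicAdd(&d_histogram[bin], smem_histogram[bin]);
}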

View File

@ -0,0 +1,197 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
*/
#pragma once
#include "../../block/block_radix_sort.cuh"
#include "../../block/block_discontinuity.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
*/
template <
typename T,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int BINS>
struct BlockHistogramSort
{
// Parameterize BlockRadixSort type for our thread block
typedef BlockRadixSort<T, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// Parameterize BlockDiscontinuity type for our thread block
typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
// Shared memory
union _TempStorage
{
// Storage for sorting bin values
typename BlockRadixSortT::TempStorage sort;
struct
{
// Storage for detecting discontinuities in the tile of sorted bin values
typename BlockDiscontinuityT::TempStorage flag;
// Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
unsigned int run_begin[BINS];
unsigned int run_end[BINS];
};
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
int linear_tid;
/// Constructor
__device__ __forceinline__ BlockHistogramSort(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
// Discontinuity functor
struct DiscontinuityOp
{
// Reference to temp_storage
_TempStorage &temp_storage;
// Constructor
__device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
temp_storage(temp_storage)
{}
// Discontinuity predicate
__device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index)
{
if (a != b)
{
// Note the begin/end offsets in shared storage
temp_storage.run_begin[b] = b_index;
temp_storage.run_end[a] = b_index;
return true;
}
else
{
return false;
}
}
};
// Composite data onto an existing histogram
template <
typename HistoCounter>
__device__ __forceinline__ void Composite(
T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
{
enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
// Sort bytes in blocked arrangement
BlockRadixSortT(temp_storage.sort, linear_tid).Sort(items);
__syncthreads();
// Initialize the shared memory's run_begin and run_end for each bin
int histo_offset = 0;
#pragma unroll
for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
{
temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
}
// Finish up with guarded initialization if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
{
temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
}
__syncthreads();
int flags[ITEMS_PER_THREAD]; // unused
// Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
DiscontinuityOp flag_op(temp_storage);
BlockDiscontinuityT(temp_storage.flag, linear_tid).FlagHeads(flags, items, flag_op);
// Update begin for first item
if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
__syncthreads();
// Composite into histogram
histo_offset = 0;
#pragma unroll
for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
{
int thread_offset = histo_offset + linear_tid;
HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
histogram[thread_offset] += count;
}
// Finish up with guarded composition if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
{
int thread_offset = histo_offset + linear_tid;
HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
histogram[thread_offset] += count;
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
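The sorting-based variant above reaches the same Composite() result without atomics: after BlockRadixSort orders the tile, each bin's count is the distance between the recorded begin/end offsets of its run. A small host-side illustration (not part of the commit; names assumed) of that counting step on a concrete 6-item tile:

// Host-side illustration of the run-length trick used by BlockHistogramSort::Composite:
// after sorting, each bin's count is run_end[bin] - run_begin[bin].
#include <cstdio>

int main()
{
    const int BINS = 4, TILE_SIZE = 6;
    int sorted[TILE_SIZE] = {0, 0, 1, 1, 1, 3};        // tile after BlockRadixSort
    int run_begin[BINS], run_end[BINS];

    for (int b = 0; b < BINS; ++b)                     // initialization step (all offsets = TILE_SIZE)
        run_begin[b] = run_end[b] = TILE_SIZE;

    run_begin[sorted[0]] = 0;                          // head of the first run (linear_tid == 0 case)
    for (int i = 1; i < TILE_SIZE; ++i)                // discontinuity flags
        if (sorted[i] != sorted[i - 1]) {
            run_begin[sorted[i]] = i;
            run_end[sorted[i - 1]] = i;
        }

    for (int b = 0; b < BINS; ++b)                     // prints counts 2, 3, 0, 1
        printf("bin %d: %d\n", b, run_end[b] - run_begin[b]);
    return 0;
}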

View File

@ -0,0 +1,214 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock
*/
#pragma once
#include "../../block/block_raking_layout.cuh"
#include "../../warp/warp_reduce.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock
*/
template <
typename T, ///< Data type being reduced
int BLOCK_THREADS> ///< The thread block size in threads
struct BlockReduceRaking
{
/// Layout type for padded threadblock raking grid
typedef BlockRakingLayout<T, BLOCK_THREADS, 1> BlockRakingLayout;
/// WarpReduce utility type
typedef typename WarpReduce<T, 1, BlockRakingLayout::RAKING_THREADS>::InternalWarpReduce WarpReduce;
/// Constants
enum
{
/// Number of raking threads
RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
/// Cooperative work can be entirely warp synchronous
WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
 /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)

WARP_SYNCHRONOUS_UNGUARDED = ((RAKING_THREADS & (RAKING_THREADS - 1)) == 0),
/// Whether or not accesses into smem are unguarded
RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
};
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
int linear_tid;
/// Constructor
__device__ __forceinline__ BlockReduceRaking(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
/// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
template <bool FULL_TILE>
__device__ __forceinline__ T Sum(
T partial, ///< [in] Calling thread's input partial reductions
int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
{
cub::Sum reduction_op;
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE, SEGMENT_LENGTH>(
partial,
num_valid);
}
else
{
// Place partial into shared memory grid.
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
__syncthreads();
// Reduce parallelism to one warp
if (linear_tid < RAKING_THREADS)
{
// Raking reduction in grid
T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
partial = raking_segment[0];
#pragma unroll
for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++)
{
// Update partial if addend is in range
if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid))
{
partial = reduction_op(partial, raking_segment[ITEM]);
}
}
partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
partial,
num_valid);
}
}
return partial;
}
/// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
template <
bool FULL_TILE,
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T partial, ///< [in] Calling thread's input partial reductions
int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE, SEGMENT_LENGTH>(
partial,
num_valid,
reduction_op);
}
else
{
// Place partial into shared memory grid.
*BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
__syncthreads();
// Reduce parallelism to one warp
if (linear_tid < RAKING_THREADS)
{
// Raking reduction in grid
T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
partial = raking_segment[0];
#pragma unroll
for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++)
{
// Update partial if addend is in range
if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid))
{
partial = reduction_op(partial, raking_segment[ITEM]);
}
}
partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
partial,
num_valid,
reduction_op);
}
}
return partial;
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
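A minimal kernel sketch (not part of the commit) of how the raking Sum above is typically invoked for a full tile; the kernel name, buffer names, launch configuration, and include path are assumptions, and only thread 0 of each block may use the returned value.

#include <cub/cub.cuh>   // umbrella header added in this commit; include path assumed

__global__ void BlockSumSketch(const int *d_in, int *d_block_sums)
{
    const int BLOCK_THREADS = 128;                     // assumed launch configuration
    typedef cub::BlockReduceRaking<int, BLOCK_THREADS> BlockReduceT;
    __shared__ BlockReduceT::TempStorage temp_storage;

    int partial = d_in[blockIdx.x * BLOCK_THREADS + threadIdx.x];

    // FULL_TILE = true: all BLOCK_THREADS partials are valid
    int block_sum = BlockReduceT(temp_storage, threadIdx.x).Sum<true>(partial, BLOCK_THREADS);

    if (threadIdx.x == 0)                              // result is only valid in thread 0
        d_block_sums[blockIdx.x] = block_sum;
}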

View File

@ -0,0 +1,198 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock
*/
#pragma once
#include "../../warp/warp_reduce.cuh"
#include "../../util_arch.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock
*/
template <
typename T, ///< Data type being reduced
int BLOCK_THREADS> ///< The thread block size in threads
struct BlockReduceWarpReductions
{
/// Constants
enum
{
/// Number of active warps
WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
/// The logical warp size for warp reductions
LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS),
/// Whether or not the logical warp size evenly divides the threadblock size
EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
};
/// WarpReduce utility type
typedef typename WarpReduce<T, WARPS, LOGICAL_WARP_SIZE>::InternalWarpReduce WarpReduce;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpReduce::TempStorage warp_reduce; ///< Buffer for warp-synchronous scan
T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
int linear_tid;
int warp_id;
int lane_id;
/// Constructor
__device__ __forceinline__ BlockReduceWarpReductions(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid),
warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
0 :
linear_tid / PtxArchProps::WARP_THREADS),
lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
linear_tid :
linear_tid % PtxArchProps::WARP_THREADS)
{}
/// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
template <
bool FULL_TILE,
typename ReductionOp>
__device__ __forceinline__ T ApplyWarpAggregates(
ReductionOp reduction_op, ///< [in] Binary scan operator
T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items
int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
{
// Share lane aggregates
if (lane_id == 0)
{
temp_storage.warp_aggregates[warp_id] = warp_aggregate;
}
__syncthreads();
// Update total aggregate in warp 0, lane 0
if (linear_tid == 0)
{
#pragma unroll
for (int SUCCESSOR_WARP = 1; SUCCESSOR_WARP < WARPS; SUCCESSOR_WARP++)
{
if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
{
warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[SUCCESSOR_WARP]);
}
}
}
return warp_aggregate;
}
/// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
template <bool FULL_TILE>
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input partial reductions
int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
{
cub::Sum reduction_op;
unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
LOGICAL_WARP_SIZE :
(warp_offset < num_valid) ?
num_valid - warp_offset :
0;
// Warp reduction in every warp
T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
input,
warp_num_valid);
// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}
/// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>.
template <
bool FULL_TILE,
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input partial reductions
int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE);
unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
LOGICAL_WARP_SIZE :
(warp_offset < num_valid) ?
num_valid - warp_offset :
0;
// Warp reduction in every warp
T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
input,
warp_num_valid,
reduction_op);
// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
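The guarded path above (FULL_TILE = false) is what handles a ragged final tile, where num_valid is smaller than BLOCK_THREADS. A minimal sketch (not part of the commit; kernel and buffer names, launch configuration, and include path are assumptions), reusing the cub::Sum functor that the file itself employs:

#include <cub/cub.cuh>   // umbrella header added in this commit; include path assumed

__global__ void PartialTileSumSketch(const int *d_in, int *d_block_sums, int num_items)
{
    const int BLOCK_THREADS = 256;                     // assumed launch configuration
    typedef cub::BlockReduceWarpReductions<int, BLOCK_THREADS> BlockReduceT;
    __shared__ BlockReduceT::TempStorage temp_storage;

    int tile_base = blockIdx.x * BLOCK_THREADS;
    int num_valid = min(num_items - tile_base, BLOCK_THREADS);   // < BLOCK_THREADS in the last block

    // Out-of-range threads contribute a dummy partial that num_valid excludes
    int partial = (threadIdx.x < num_valid) ? d_in[tile_base + threadIdx.x] : 0;

    cub::Sum reduction_op;
    int block_sum = BlockReduceT(temp_storage, threadIdx.x).Reduce<false>(partial, num_valid, reduction_op);

    if (threadIdx.x == 0)                              // result is only valid in thread 0
        d_block_sums[blockIdx.x] = block_sum;
}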

View File

@ -0,0 +1,761 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
*/
#pragma once
#include "../../util_arch.cuh"
#include "../../block/block_raking_layout.cuh"
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_scan.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
*/
template <
typename T, ///< Data type being scanned
int BLOCK_THREADS, ///< The thread block size in threads
bool MEMOIZE> ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
struct BlockScanRaking
{
/// Layout type for padded threadblock raking grid
typedef BlockRakingLayout<T, BLOCK_THREADS> BlockRakingLayout;
/// Constants
enum
{
/// Number of active warps
WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
/// Number of raking threads
RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
/// Number of raking elements per warp synchronous raking thread
SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
/// Cooperative work can be entirely warp synchronous
WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
};
/// WarpScan utility type
typedef WarpScan<T, 1, RAKING_THREADS> WarpScan;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan
typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
T block_aggregate; ///< Block aggregate
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
int linear_tid;
T cached_segment[SEGMENT_LENGTH];
/// Constructor
__device__ __forceinline__ BlockScanRaking(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid)
{}
/// Performs upsweep raking reduction, returning the aggregate
template <typename ScanOp>
__device__ __forceinline__ T Upsweep(
ScanOp scan_op)
{
T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
T *raking_ptr;
if (MEMOIZE)
{
// Copy data into registers
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; i++)
{
cached_segment[i] = smem_raking_ptr[i];
}
raking_ptr = cached_segment;
}
else
{
raking_ptr = smem_raking_ptr;
}
T raking_partial = raking_ptr[0];
#pragma unroll
for (int i = 1; i < SEGMENT_LENGTH; i++)
{
if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + i) < BLOCK_THREADS))
{
raking_partial = scan_op(raking_partial, raking_ptr[i]);
}
}
return raking_partial;
}
/// Performs exclusive downsweep raking scan
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveDownsweep(
ScanOp scan_op,
T raking_partial,
bool apply_prefix = true)
{
T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
T *raking_ptr = (MEMOIZE) ?
cached_segment :
smem_raking_ptr;
ThreadScanExclusive<SEGMENT_LENGTH>(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix);
if (MEMOIZE)
{
// Copy data back to smem
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; i++)
{
smem_raking_ptr[i] = cached_segment[i];
}
}
}
/// Performs inclusive downsweep raking scan
template <typename ScanOp>
__device__ __forceinline__ void InclusiveDownsweep(
ScanOp scan_op,
T raking_partial,
bool apply_prefix = true)
{
T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
T *raking_ptr = (MEMOIZE) ?
cached_segment :
smem_raking_ptr;
ThreadScanInclusive<SEGMENT_LENGTH>(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix);
if (MEMOIZE)
{
// Copy data back to smem
#pragma unroll
for (int i = 0; i < SEGMENT_LENGTH; i++)
{
smem_raking_ptr[i] = cached_segment[i];
}
}
}
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input items
T &output, ///< [out] Calling thread's output items (may be aliased to \p input)
const T &identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
input,
output,
identity,
scan_op,
block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
identity,
scan_op,
temp_storage.block_aggregate);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
 /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
input,
output,
identity,
scan_op,
block_aggregate,
block_prefix_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
identity,
scan_op,
temp_storage.block_aggregate,
block_prefix_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
input,
output,
scan_op,
block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
scan_op,
temp_storage.block_aggregate);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
 /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
input,
output,
scan_op,
block_aggregate,
block_prefix_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
scan_op,
temp_storage.block_aggregate,
block_prefix_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
/// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
__device__ __forceinline__ void ExclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
input,
output,
block_aggregate);
}
else
{
// Raking scan
Sum scan_op;
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
raking_partial,
raking_partial,
temp_storage.block_aggregate);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
/// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
input,
output,
block_aggregate,
block_prefix_op);
}
else
{
// Raking scan
Sum scan_op;
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
raking_partial,
raking_partial,
temp_storage.block_aggregate,
block_prefix_op);
// Exclusive raking downsweep scan
ExclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
/// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan(
input,
output,
scan_op,
block_aggregate);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
scan_op,
temp_storage.block_aggregate);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
 /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan(
input,
output,
scan_op,
block_aggregate,
block_prefix_op);
}
else
{
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan(
raking_partial,
raking_partial,
scan_op,
temp_storage.block_aggregate,
block_prefix_op);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
 /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum(
input,
output,
block_aggregate);
}
else
{
// Raking scan
Sum scan_op;
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Exclusive warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
raking_partial,
raking_partial,
temp_storage.block_aggregate);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
 /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename BlockPrefixOp>
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
if (WARP_SYNCHRONOUS)
{
// Short-circuit directly to warp scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum(
input,
output,
block_aggregate,
block_prefix_op);
}
else
{
// Raking scan
Sum scan_op;
// Place thread partial into shared memory raking grid
T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
*placement_ptr = input;
__syncthreads();
// Reduce parallelism down to just raking threads
if (linear_tid < RAKING_THREADS)
{
// Raking upsweep reduction in grid
T raking_partial = Upsweep(scan_op);
// Warp synchronous scan
WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum(
raking_partial,
raking_partial,
temp_storage.block_aggregate,
block_prefix_op);
// Inclusive raking downsweep scan
InclusiveDownsweep(scan_op, raking_partial);
}
__syncthreads();
// Grab thread prefix from shared memory
output = *placement_ptr;
// Retrieve block aggregate
block_aggregate = temp_storage.block_aggregate;
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
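A minimal sketch (not part of the commit) of the ExclusiveSum entry point above, turning a tile of inputs into exclusive prefixes plus a per-block total; the kernel name, buffer names, launch configuration, MEMOIZE choice, and include path are assumptions.

#include <cub/cub.cuh>   // umbrella header added in this commit; include path assumed

__global__ void BlockExclusiveSumSketch(const int *d_in, int *d_out, int *d_block_totals)
{
    const int BLOCK_THREADS = 128;                     // assumed launch configuration
    const bool MEMOIZE = true;                         // buffer raking segments in registers
    typedef cub::BlockScanRaking<int, BLOCK_THREADS, MEMOIZE> BlockScanT;
    __shared__ BlockScanT::TempStorage temp_storage;

    int idx = blockIdx.x * BLOCK_THREADS + threadIdx.x;
    int input = d_in[idx];
    int output, block_aggregate;

    // Every thread receives its exclusive prefix and the block-wide total
    BlockScanT(temp_storage, threadIdx.x).ExclusiveSum(input, output, block_aggregate);

    d_out[idx] = output;
    if (threadIdx.x == 0)
        d_block_totals[blockIdx.x] = block_aggregate;
}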

View File

@ -0,0 +1,342 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
 * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
#pragma once
#include "../../util_arch.cuh"
#include "../../warp/warp_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
*/
template <
typename T,
int BLOCK_THREADS>
struct BlockScanWarpScans
{
/// Constants
enum
{
/// Number of active warps
WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS,
};
/// WarpScan utility type
typedef WarpScan<T, WARPS, PtxArchProps::WARP_THREADS> WarpScan;
/// Shared memory storage layout type
struct _TempStorage
{
typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan
T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan
T block_prefix; ///< Shared prefix for the entire threadblock
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Thread fields
_TempStorage &temp_storage;
int linear_tid;
int warp_id;
int lane_id;
/// Constructor
__device__ __forceinline__ BlockScanWarpScans(
TempStorage &temp_storage,
int linear_tid)
:
temp_storage(temp_storage.Alias()),
linear_tid(linear_tid),
warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
0 :
linear_tid / PtxArchProps::WARP_THREADS),
lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ?
linear_tid :
linear_tid % PtxArchProps::WARP_THREADS)
{}
/// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in <em>thread</em><sub>0</sub>.
template <typename ScanOp>
__device__ __forceinline__ void ApplyWarpAggregates(
T &partial, ///< [out] The calling thread's partial reduction
ScanOp scan_op, ///< [in] Binary scan operator
T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items
bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid
{
// Share lane aggregates
temp_storage.warp_aggregates[warp_id] = warp_aggregate;
__syncthreads();
block_aggregate = temp_storage.warp_aggregates[0];
#pragma unroll
for (int WARP = 1; WARP < WARPS; WARP++)
{
if (warp_id == WARP)
{
partial = (lane_valid) ?
scan_op(block_aggregate, partial) : // fold it in our valid partial
block_aggregate; // replace our invalid partial with the aggregate
}
block_aggregate = scan_op(block_aggregate, temp_storage.warp_aggregates[WARP]);
}
}
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input items
T &output, ///< [out] Calling thread's output items (may be aliased to \p input)
const T &identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
T warp_aggregate;
WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
// Update outputs and block_aggregate with warp-wide aggregates
ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate);
}
 /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
ExclusiveScan(input, output, identity, scan_op, block_aggregate);
// Compute and share threadblock prefix
if (warp_id == 0)
{
temp_storage.block_prefix = block_prefix_op(block_aggregate);
}
__syncthreads();
// Incorporate threadblock prefix into outputs
output = scan_op(temp_storage.block_prefix, output);
}
/// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
T warp_aggregate;
WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate);
// Update outputs and block_aggregate with warp-wide aggregates
ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate, (lane_id > 0));
}
 /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
ExclusiveScan(input, output, scan_op, block_aggregate);
// Compute and share threadblock prefix
if (warp_id == 0)
{
temp_storage.block_prefix = block_prefix_op(block_aggregate);
}
__syncthreads();
// Incorporate threadblock prefix into outputs
output = (linear_tid == 0) ?
temp_storage.block_prefix :
scan_op(temp_storage.block_prefix, output);
}
/// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
__device__ __forceinline__ void ExclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
T warp_aggregate;
WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveSum(input, output, warp_aggregate);
// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate);
}
/// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename BlockPrefixOp>
__device__ __forceinline__ void ExclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
ExclusiveSum(input, output, block_aggregate);
// Compute and share threadblock prefix
if (warp_id == 0)
{
temp_storage.block_prefix = block_prefix_op(block_aggregate);
}
__syncthreads();
// Incorporate threadblock prefix into outputs
Sum scan_op;
output = scan_op(temp_storage.block_prefix, output);
}
/// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
T warp_aggregate;
WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate);
// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate);
}
 /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <
typename ScanOp,
typename BlockPrefixOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
InclusiveScan(input, output, scan_op, block_aggregate);
// Compute and share threadblock prefix
if (warp_id == 0)
{
temp_storage.block_prefix = block_prefix_op(block_aggregate);
}
__syncthreads();
// Incorporate threadblock prefix into outputs
output = scan_op(temp_storage.block_prefix, output);
}
 /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items
{
T warp_aggregate;
WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate);
// Update outputs and block_aggregate with warp-wide aggregates from lane-0s
ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate);
}
 /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
template <typename BlockPrefixOp>
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item
T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
{
InclusiveSum(input, output, block_aggregate);
// Compute and share threadblock prefix
if (warp_id == 0)
{
temp_storage.block_prefix = block_prefix_op(block_aggregate);
}
__syncthreads();
// Incorporate threadblock prefix into outputs
Sum scan_op;
output = scan_op(temp_storage.block_prefix, output);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
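// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file): how the
// prefix-callback overloads documented above are typically driven when a
// single thread block scans many consecutive tiles. It assumes the public
// cub::BlockScan front-end exposes the same InclusiveSum overload shown above;
// the kernel name, BLOCK_THREADS parameter, and functor name are illustrative.
// Call-back functor: the first warp invokes it with the tile's aggregate, and
// the value returned by lane-0 seeds the next tile's scan.
struct RunningTotalPrefixOp
{
    int running_total;
    __device__ RunningTotalPrefixOp(int seed) : running_total(seed) {}
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;     // prefix applied to the current tile
        running_total += block_aggregate;   // carry into the next tile
        return old_prefix;
    }
};
template <int BLOCK_THREADS>
__global__ void TiledInclusiveSum(int *d_data, int num_tiles)
{
    typedef cub::BlockScan<int, BLOCK_THREADS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;
    RunningTotalPrefixOp prefix_op(0);
    for (int tile = 0; tile < num_tiles; ++tile)
    {
        int idx = (tile * BLOCK_THREADS) + threadIdx.x;
        int item = d_data[idx];             // output may alias input
        int block_aggregate;
        BlockScanT(temp_storage).InclusiveSum(item, item, block_aggregate, prefix_op);
        d_data[idx] = item;
        __syncthreads();                    // temp_storage is reused across tiles
    }
}
// ---------------------------------------------------------------------------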

@ -0,0 +1,84 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* CUB umbrella include file
*/
#pragma once
// Block
#include "block/block_histogram.cuh"
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_load.cuh"
#include "block/block_radix_rank.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
// Device
#include "device/device_histogram.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_scan.cuh"
// Grid
//#include "grid/grid_barrier.cuh"
#include "grid/grid_even_share.cuh"
#include "grid/grid_mapping.cuh"
#include "grid/grid_queue.cuh"
// Host
#include "host/spinlock.cuh"
// Thread
#include "thread/thread_load.cuh"
#include "thread/thread_operators.cuh"
#include "thread/thread_reduce.cuh"
#include "thread/thread_scan.cuh"
#include "thread/thread_store.cuh"
// Warp
#include "warp/warp_reduce.cuh"
#include "warp/warp_scan.cuh"
// Util
#include "util_allocator.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_device.cuh"
#include "util_macro.cuh"
#include "util_ptx.cuh"
#include "util_type.cuh"
#include "util_iterator.cuh"
#include "util_vector.cuh"

@ -0,0 +1,322 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockHistogramTiles implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram.
*/
#pragma once
#include <iterator>
#include "specializations/block_histo_tiles_gatomic.cuh"
#include "specializations/block_histo_tiles_satomic.cuh"
#include "specializations/block_histo_tiles_sort.cuh"
#include "../../util_type.cuh"
#include "../../grid/grid_mapping.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Algorithmic variants
******************************************************************************/
/**
* \brief BlockHistogramTilesAlgorithm enumerates alternative algorithms for BlockHistogramTiles.
*/
enum BlockHistogramTilesAlgorithm
{
/**
* \par Overview
* A two-kernel approach in which:
* -# Thread blocks in the first kernel aggregate their own privatized
* histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
* -# A single thread block in the second kernel reduces them into the output histogram(s).
*
* \par Performance Considerations
* Delivers consistent throughput regardless of sample bin distribution.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*/
GRID_HISTO_SORT,
/**
* \par Overview
* A two-kernel approach in which:
* -# Thread blocks in the first kernel aggregate their own privatized
* histograms using shared-memory \p atomicAdd().
* -# A single thread block in the second kernel reduces them into the
* output histogram(s).
*
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
* addition, and may be significantly degraded for non-uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*
* However, because histograms are privatized in shared memory, a large
* number of bins (e.g., thousands) may adversely affect occupancy and
* performance (or even the ability to launch).
*/
GRID_HISTO_SHARED_ATOMIC,
/**
* \par Overview
* A single-kernel approach in which thread blocks update the output histogram(s) directly
* using global-memory \p atomicAdd().
*
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
* addition, and may be significantly degraded for non-uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*
* Performance is not significantly impacted when computing histograms having large
* numbers of bins (e.g., thousands).
*/
GRID_HISTO_GLOBAL_ATOMIC,
};
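// Illustrative sketch (not part of CUB): the GRID_HISTO_GLOBAL_ATOMIC idea in its
// simplest possible form -- each thread classifies its samples and bumps the output
// histogram directly with atomicAdd(). The kernel name and the assumption that the
// histogram is pre-zeroed are illustrative only; the implementation below layers
// tiling, multi-channel support, and tuning policies on top of this core loop.
__global__ void NaiveGlobalAtomicHistogram(
    const unsigned char *d_samples,     // input samples, each in [0, num_bins)
    unsigned int        *d_histogram,   // output bin counts (assumed pre-zeroed)
    int                  num_samples)
{
    for (int i = (blockIdx.x * blockDim.x) + threadIdx.x;
         i < num_samples;
         i += gridDim.x * blockDim.x)
    {
        atomicAdd(d_histogram + d_samples[i], 1u);
    }
}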
/******************************************************************************
* Tuning policy
******************************************************************************/
/**
* Tuning policy for BlockHistogramTiles
*/
template <
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
BlockHistogramTilesAlgorithm _GRID_ALGORITHM,
GridMappingStrategy _GRID_MAPPING,
int _SM_OCCUPANCY>
struct BlockHistogramTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
SM_OCCUPANCY = _SM_OCCUPANCY,
};
static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = _GRID_ALGORITHM;
static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* Implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
*/
template <
typename BlockHistogramTilesPolicy, ///< Tuning policy
int BINS, ///< Number of histogram bins per channel
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1]
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integer type for offsets
struct BlockHistogramTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Histogram grid algorithm
static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = BlockHistogramTilesPolicy::GRID_ALGORITHM;
// Alternative internal implementation types
typedef BlockHistogramTilesSort< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSortT;
typedef BlockHistogramTilesSharedAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSharedAtomicT;
typedef BlockHistogramTilesGlobalAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesGlobalAtomicT;
// Internal block sweep histogram type
typedef typename If<(GRID_ALGORITHM == GRID_HISTO_SORT),
BlockHistogramTilesSortT,
typename If<(GRID_ALGORITHM == GRID_HISTO_SHARED_ATOMIC),
BlockHistogramTilesSharedAtomicT,
BlockHistogramTilesGlobalAtomicT>::Type>::Type InternalBlockDelegate;
enum
{
TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
};
// Temporary storage type
typedef typename InternalBlockDelegate::TempStorage TempStorage;
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
// Internal block delegate
InternalBlockDelegate internal_delegate;
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockHistogramTiles(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data to reduce
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
:
internal_delegate(temp_storage, d_in, d_out_histograms)
{}
/**
* \brief Reduce a consecutive segment of input tiles
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
SizeT block_oob) ///< [in] Threadblock end offset (exclusive)
{
// Consume subsequent full tiles of input
while (block_offset + TILE_ITEMS <= block_oob)
{
internal_delegate.ConsumeTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Consume a partially-full tile
if (block_offset < block_oob)
{
int valid_items = block_oob - block_offset;
internal_delegate.ConsumeTile<false>(block_offset, valid_items);
}
// Aggregate output
internal_delegate.AggregateOutput();
}
/**
* Reduce a consecutive segment of input tiles
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT num_items, ///< [in] Total number of global input items
GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
{
even_share.BlockInit();
ConsumeTiles(even_share.block_offset, even_share.block_oob);
}
/**
* Dequeue and reduce tiles of items as part of an inter-block reduction
*/
__device__ __forceinline__ void ConsumeTiles(
int num_items, ///< Total number of input items
GridQueue<SizeT> queue) ///< Queue descriptor for assigning tiles of work to thread blocks
{
// Shared block offset
__shared__ SizeT shared_block_offset;
// We give each thread block at least one tile of input.
SizeT block_offset = blockIdx.x * TILE_ITEMS;
SizeT even_share_base = gridDim.x * TILE_ITEMS;
// Process full tiles of input
while (block_offset + TILE_ITEMS <= num_items)
{
internal_delegate.ConsumeTile<true>(block_offset);
// Dequeue up to TILE_ITEMS
if (threadIdx.x == 0)
shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
__syncthreads();
block_offset = shared_block_offset;
__syncthreads();
}
// Consume a partially-full tile
if (block_offset < num_items)
{
int valid_items = num_items - block_offset;
internal_delegate.ConsumeTile<false>(block_offset, valid_items);
}
// Aggregate output
internal_delegate.AggregateOutput();
}
/**
* Dequeue and reduce tiles of items as part of an inter-block reduction
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT num_items, ///< [in] Total number of global input items
GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor
Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
{
ConsumeTiles(num_items, queue);
}
};
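// Illustrative sketch (not part of CUB): the queue-draining pattern used by the
// dynamic ConsumeTiles() overload above, reduced to a bare global counter. Thread
// blocks repeatedly "drain" the next tile offset with atomicAdd() until the input
// is exhausted, which keeps all SMs busy even when tiles take unequal time. The
// kernel name, the summation payload, and the counter layout are assumptions.
__global__ void DynamicTileSum(
    const int *d_in,            // input items
    int       *d_total,         // single global accumulator, initialized to 0
    int       *d_tile_counter,  // global work counter, initialized to 0
    int        num_items,
    int        tile_items)      // tile size in items
{
    __shared__ int tile_base;
    while (true)
    {
        // One thread per block dequeues the next tile offset
        if (threadIdx.x == 0)
            tile_base = atomicAdd(d_tile_counter, tile_items);
        __syncthreads();
        int block_offset = tile_base;   // every thread copies the offset to a register...
        __syncthreads();                // ...so tile_base may be safely overwritten next round
        if (block_offset >= num_items)
            break;                      // uniform across the block: no barrier divergence
        // Each thread reduces its strided share of the tile
        int tile_end = (block_offset + tile_items < num_items) ? block_offset + tile_items : num_items;
        int thread_sum = 0;
        for (int i = block_offset + threadIdx.x; i < tile_end; i += blockDim.x)
            thread_sum += d_in[i];
        // Crude per-thread aggregation; a real implementation would use a block-wide reduction
        atomicAdd(d_total, thread_sum);
    }
}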
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

@ -0,0 +1,381 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
*/
#pragma once
#include <iterator>
#include "scan_tiles_types.cuh"
#include "../../thread/thread_operators.cuh"
#include "../../block/block_load.cuh"
#include "../../block/block_store.cuh"
#include "../../block/block_scan.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_vector.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockPartitionTiles
*/
template <
int _PARTITIONS,
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
PtxLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM>
struct BlockPartitionTilesPolicy
{
enum
{
PARTITIONS = _PARTITIONS,
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
};
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
};
/**
* Tuple type for scanning partition membership flags
*/
template <
typename SizeT,
int PARTITIONS>
struct PartitionScanTuple;
/**
* Tuple type for scanning partition membership flags (specialized for 1 output partition)
*/
template <typename SizeT>
struct PartitionScanTuple<SizeT, 1> : VectorHelper<SizeT, 1>::Type
{
__device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other)
{
PartitionScanTuple retval;
retval.x = x + other.x;
return retval;
}
template <typename PredicateOp, typename T>
__device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
{
this->x = pred_op(val);
}
template <typename PredicateOp, typename T, typename OutputIteratorRA>
__device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
{
if (pred_op(val))
d_out[this->x - 1] = val;
}
};
/**
* Tuple type for scanning partition membership flags (specialized for 2 output partitions)
*/
template <typename SizeT>
struct PartitionScanTuple<SizeT, 2> : VectorHelper<SizeT, 2>::Type
{
__device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other)
{
PartitionScanTuple retval;
retval.x = x + other.x;
retval.y = y + other.y;
return retval;
}
template <typename PredicateOp, typename T>
__device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val)
{
bool pred = pred_op(val);
this->x = pred;
this->y = !pred;
}
template <typename PredicateOp, typename T, typename OutputIteratorRA>
__device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items)
{
SizeT scatter_offset = (pred_op(val)) ?
this->x - 1 :
num_items - this->y;
d_out[scatter_offset] = val;
}
};
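// Illustrative sketch (not part of CUB): the scatter rule encoded by the two-partition
// tuple above, written out sequentially on the host. After an inclusive scan of the
// (selected, rejected) flag pairs, a selected item lands at offset x-1 (filling the
// output from the front) and a rejected item at num_items-y (filling it from the back).
// The function name and parameter layout are assumptions for illustration.
template <typename T, typename SizeT, typename PredicateOp>
void SequentialTwoWayPartition(const T *in, SizeT num_items, PredicateOp pred_op, T *out)
{
    SizeT x = 0;    // running count of selected items (first partition)
    SizeT y = 0;    // running count of rejected items (second partition)
    for (SizeT i = 0; i < num_items; ++i)
    {
        bool selected = pred_op(in[i]);
        x += selected ? 1 : 0;
        y += selected ? 0 : 1;
        SizeT scatter_offset = selected ? (x - 1) : (num_items - y);
        out[scatter_offset] = in[i];
    }
}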
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning.
*
* Implements a single-pass "domino" strategy with adaptive prefix lookback.
*/
template <
typename BlockPartitionTilesPolicy, ///< Tuning policy
typename InputIteratorRA, ///< Input iterator type
typename OutputIteratorRA, ///< Output iterator type
typename PredicateOp, ///< Partition predicate functor type
typename SizeT> ///< Offset integer type
struct BlockPartitionTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
PARTITIONS = BlockPartitionTilesPolicy::PARTITIONS,
BLOCK_THREADS = BlockPartitionTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockPartitionTilesPolicy::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Load modifier
static const PtxLoadModifier LOAD_MODIFIER = BlockPartitionTilesPolicy::LOAD_MODIFIER;
// Data type of input iterator
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuple type for scanning partition membership flags
typedef PartitionScanTuple<SizeT, PARTITIONS> PartitionScanTuple;
// Tile status descriptor type
typedef ScanTileDescriptor<PartitionScanTuple> ScanTileDescriptorT;
// Block scan type for scanning membership flag scan_tuples
typedef BlockScan<
PartitionScanTuple,
BlockPartitionTilesPolicy::BLOCK_THREADS,
BlockPartitionTilesPolicy::SCAN_ALGORITHM> BlockScanT;
// Callback type for obtaining inter-tile prefix during block scan
typedef DeviceScanBlockPrefixOp<PartitionScanTuple, Sum> InterblockPrefixOp;
// Shared memory type for this threadblock
struct TempStorage
{
typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback
typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
SizeT tile_idx; // Shared tile index
};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage; ///< Reference to temp_storage
InputIteratorRA d_in; ///< Input data
OutputIteratorRA d_out; ///< Output data
ScanTileDescriptorT *d_tile_status; ///< Global list of tile status
PredicateOp pred_op; ///< Unary predicate operator indicating membership in the first partition
SizeT num_items; ///< Total number of input items
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
BlockPartitionTiles(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
ScanTileDescriptorT *d_tile_status, ///< Global list of tile status
PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition
SizeT num_items) ///< Total number of input items
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_out(d_out),
d_tile_status(d_tile_status),
pred_op(pred_op),
num_items(num_items)
{}
//---------------------------------------------------------------------
// Domino scan
//---------------------------------------------------------------------
/**
* Process a tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
int tile_idx, ///< Tile index
SizeT block_offset, ///< Tile offset
PartitionScanTuple &partition_ends) ///< Running total
{
T items[ITEMS_PER_THREAD];
PartitionScanTuple scan_tuples[ITEMS_PER_THREAD];
// Load items
int valid_items = num_items - block_offset;
if (FULL_TILE)
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
else
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
// Prevent hoisting
// __syncthreads();
// __threadfence_block();
// Set partition membership flags in scan_tuples
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scan_tuples[ITEM].SetFlags(pred_op, items[ITEM]);
}
// Perform inclusive scan over scan_tuples
PartitionScanTuple block_aggregate;
if (tile_idx == 0)
{
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate);
partition_ends = block_aggregate;
// Update tile status if there are successor tiles
if (FULL_TILE && (threadIdx.x == 0))
ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
}
else
{
InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, Sum(), tile_idx);
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate, prefix_op);
partition_ends = prefix_op.inclusive_prefix;
}
// Scatter items
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
// Scatter if not out-of-bounds
if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items))
{
scan_tuples[ITEM].Scatter(pred_op, items[ITEM], d_out, num_items);
}
}
}
/**
* Dequeue and scan tiles of items as part of a domino scan
*/
__device__ __forceinline__ void ConsumeTiles(
GridQueue<int> queue, ///< [in] Queue descriptor for assigning tiles of work to thread blocks
SizeT num_tiles, ///< [in] Total number of input tiles
PartitionScanTuple &partition_ends, ///< [out] Running partition end offsets
bool &is_last_tile) ///< [out] Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input)
{
#if CUB_PTX_ARCH < 200
// No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
int tile_idx = blockIdx.x;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
if (block_offset + TILE_ITEMS <= num_items)
{
ConsumeTile<true>(tile_idx, block_offset, partition_ends);
}
else if (block_offset < num_items)
{
ConsumeTile<false>(tile_idx, block_offset, partition_ends);
}
is_last_tile = (tile_idx == num_tiles - 1);
#else
// Get first tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
int tile_idx = temp_storage.tile_idx;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
while (block_offset + TILE_ITEMS <= num_items)
{
// Consume full tile
ConsumeTile<true>(tile_idx, block_offset, partition_ends);
is_last_tile = (tile_idx == num_tiles - 1);
// Get next tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
tile_idx = temp_storage.tile_idx;
block_offset = SizeT(TILE_ITEMS) * tile_idx;
}
// Consume a partially-full tile
if (block_offset < num_items)
{
ConsumeTile<false>(tile_idx, block_offset, partition_ends);
is_last_tile = (tile_idx == num_tiles - 1);
}
#endif
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
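// Illustrative sketch (not part of CUB): a sequential mock-up of the single-pass
// "domino" prefix protocol referenced above. Each tile first publishes its local
// aggregate, then computes its exclusive prefix by walking backwards over
// predecessor descriptors, and finally publishes its inclusive prefix so that
// later tiles can stop their lookback early. Status codes, struct and function
// names are assumptions; a GPU version must additionally spin while a
// predecessor's status is still unpublished.
enum MockTileStatus { MOCK_TILE_INVALID, MOCK_TILE_PARTIAL, MOCK_TILE_PREFIX };
struct MockTileDescriptor
{
    MockTileStatus status;
    int            value;   // aggregate (PARTIAL) or inclusive prefix (PREFIX)
};
// Returns the inclusive prefix for tile 'tile_idx' given its local aggregate.
inline int MockDominoScan(MockTileDescriptor *tiles, int tile_idx, int local_aggregate)
{
    // 1. Publish this tile's aggregate so successors can make progress
    tiles[tile_idx].value  = local_aggregate;
    tiles[tile_idx].status = MOCK_TILE_PARTIAL;
    // 2. Look back over predecessors, accumulating their contributions
    int exclusive_prefix = 0;
    for (int pred = tile_idx - 1; pred >= 0; --pred)
    {
        exclusive_prefix += tiles[pred].value;
        if (tiles[pred].status == MOCK_TILE_PREFIX)
            break;   // that predecessor already folded in everything before it
    }
    // 3. Publish the inclusive prefix so later tiles can stop their lookback here
    tiles[tile_idx].value  = exclusive_prefix + local_aggregate;
    tiles[tile_idx].status = MOCK_TILE_PREFIX;
    return tiles[tile_idx].value;
}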

@ -0,0 +1,713 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in a device-wide radix sort downsweep pass.
*/
#pragma once
#include "../../thread/thread_load.cuh"
#include "../../block/block_load.cuh"
#include "../../block/block_store.cuh"
#include "../../block/block_radix_rank.cuh"
#include "../../block/block_exchange.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Types of scattering strategies
*/
enum RadixSortScatterAlgorithm
{
RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins
RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins
};
/**
* Tuning policy for BlockRadixSortDownsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
int _ITEMS_PER_THREAD, ///< The number of consecutive downsweep keys to process per thread
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
PtxLoadModifier _LOAD_MODIFIER, ///< The PTX cache-modifier to use for loads
bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The cub::BlockScanAlgorithm algorithm to use
RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
struct BlockRadixSortDownsweepTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING,
RADIX_BITS = _RADIX_BITS,
MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM;
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG;
typedef BlockRadixSortDownsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
LOAD_MODIFIER,
EXCHANGE_TIME_SLICING,
MEMOIZE_OUTER_SCAN,
INNER_SCAN_ALGORITHM,
SCATTER_ALGORITHM,
SMEM_CONFIG,
CUB_MAX(1, RADIX_BITS - 1)> AltPolicy;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* CTA-wide "downsweep" abstraction for distributing keys from
* a range of input tiles.
*/
template <
typename BlockRadixSortDownsweepTilesPolicy,
typename Key,
typename Value,
typename SizeT>
struct BlockRadixSortDownsweepTiles
{
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
// Appropriate unsigned-bits representation of Key
typedef typename Traits<Key>::UnsignedBits UnsignedBits;
static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM;
static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER;
static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM;
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG;
enum
{
BLOCK_THREADS = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING,
RADIX_BITS = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS,
MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_DIGITS = 1 << RADIX_BITS,
KEYS_ONLY = Equals<Value, NullType>::VALUE,
WARP_THREADS = 1 << PtxArchProps::LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
BYTES_PER_SIZET = sizeof(SizeT),
LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE,
LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS,
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS,
STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS,
};
// BlockRadixRank type
typedef BlockRadixRank<
BLOCK_THREADS,
RADIX_BITS,
MEMOIZE_OUTER_SCAN,
INNER_SCAN_ALGORITHM,
SMEM_CONFIG> BlockRadixRank;
// BlockLoad type (keys)
typedef BlockLoad<
UnsignedBits*,
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
LOAD_MODIFIER,
EXCHANGE_TIME_SLICING> BlockLoadKeys;
// BlockLoad type (values)
typedef BlockLoad<
Value*,
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
LOAD_MODIFIER,
EXCHANGE_TIME_SLICING> BlockLoadValues;
// BlockExchange type (keys)
typedef BlockExchange<
UnsignedBits,
BLOCK_THREADS,
ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING> BlockExchangeKeys;
// BlockExchange type (values)
typedef BlockExchange<
Value,
BLOCK_THREADS,
ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING> BlockExchangeValues;
/**
* Shared memory storage layout
*/
struct _TempStorage
{
SizeT relative_bin_offsets[RADIX_DIGITS + 1];
bool short_circuit;
union
{
typename BlockRadixRank::TempStorage ranking;
typename BlockLoadKeys::TempStorage load_keys;
typename BlockLoadValues::TempStorage load_values;
typename BlockExchangeKeys::TempStorage exchange_keys;
typename BlockExchangeValues::TempStorage exchange_values;
};
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
// Shared storage for this CTA
_TempStorage &temp_storage;
// Input and output device pointers
UnsignedBits *d_keys_in;
UnsignedBits *d_keys_out;
Value *d_values_in;
Value *d_values_out;
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
SizeT bin_offset;
// The least-significant bit position of the current digit to extract
int current_bit;
// Whether to short-circuit
bool short_circuit;
//---------------------------------------------------------------------
// Utility methods
//---------------------------------------------------------------------
/**
* Decodes given keys to lookup digit offsets in shared memory
*/
__device__ __forceinline__ void DecodeRelativeBinOffsets(
UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD])
{
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, RADIX_BITS);
// Lookup base digit offset from shared memory
relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
}
}
/**
* Scatter ranked items to global memory
*/
template <bool FULL_TILE, typename T>
__device__ __forceinline__ void ScatterItems(
T (&items)[ITEMS_PER_THREAD],
int (&local_ranks)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
T *d_out,
SizeT valid_items)
{
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
// Scatter if not out-of-bounds
if (FULL_TILE || (local_ranks[ITEM] < valid_items))
{
d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM];
}
}
}
/**
* Scatter ranked keys directly to global memory
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ScatterKeys(
UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT valid_items,
Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
{
// Compute scatter offsets
DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
// Untwiddle keys before outputting
UnsignedBits keys[ITEMS_PER_THREAD];
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]);
}
// Scatter to global
ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items);
}
/**
* Scatter ranked keys through shared memory, then to global memory
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ScatterKeys(
UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT valid_items,
Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
{
// Exchange keys through shared memory
BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
// Compute striped local ranks
int local_ranks[ITEMS_PER_THREAD];
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
}
// Scatter directly
ScatterKeys<FULL_TILE>(
twiddled_keys,
relative_bin_offsets,
local_ranks,
valid_items,
Int2Type<RADIX_SORT_SCATTER_DIRECT>());
}
/**
* Scatter ranked values directly to global memory
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ScatterValues(
Value (&values)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT valid_items,
Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
{
// Scatter to global
ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items);
}
/**
* Scatter ranked values through shared memory, then to global memory
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ScatterValues(
Value (&values)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT valid_items,
Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
{
__syncthreads();
// Exchange values through shared memory
BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
// Compute striped local ranks
int local_ranks[ITEMS_PER_THREAD];
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
}
// Scatter directly
ScatterValues<FULL_TILE>(
values,
relative_bin_offsets,
local_ranks,
valid_items,
Int2Type<RADIX_SORT_SCATTER_DIRECT>());
}
/**
* Load a tile of items (specialized for full tile)
*/
template <typename BlockLoadT, typename T>
__device__ __forceinline__ void LoadItems(
BlockLoadT &block_loader,
T (&items)[ITEMS_PER_THREAD],
T *d_in,
SizeT valid_items,
Int2Type<true> is_full_tile)
{
block_loader.Load(d_in, items);
}
/**
* Load a tile of items (specialized for partial tile)
*/
template <typename BlockLoadT, typename T>
__device__ __forceinline__ void LoadItems(
BlockLoadT &block_loader,
T (&items)[ITEMS_PER_THREAD],
T *d_in,
SizeT valid_items,
Int2Type<false> is_full_tile)
{
block_loader.Load(d_in, items, valid_items);
}
/**
* Truck along associated values
*/
template <bool FULL_TILE, typename _Value>
__device__ __forceinline__ void GatherScatterValues(
_Value (&values)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT block_offset,
SizeT valid_items)
{
BlockLoadValues loader(temp_storage.load_values);
LoadItems(
loader,
values,
d_values_in + block_offset,
valid_items,
Int2Type<FULL_TILE>());
ScatterValues<FULL_TILE>(
values,
relative_bin_offsets,
ranks,
valid_items,
Int2Type<SCATTER_ALGORITHM>());
}
/**
* Truck along associated values (specialized for key-only sorting)
*/
template <bool FULL_TILE>
__device__ __forceinline__ void GatherScatterValues(
NullType (&values)[ITEMS_PER_THREAD],
SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD],
int (&ranks)[ITEMS_PER_THREAD],
SizeT block_offset,
SizeT valid_items)
{}
/**
* Process tile
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ProcessTile(
SizeT block_offset,
const SizeT &valid_items = TILE_ITEMS)
{
// Per-thread tile data
UnsignedBits keys[ITEMS_PER_THREAD]; // Keys
UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys
int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA
SizeT relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit
if (LOAD_ALGORITHM != BLOCK_LOAD_DIRECT) __syncthreads();
// Assign max-key to all keys
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
keys[ITEM] = MAX_KEY;
}
// Load tile of keys
BlockLoadKeys loader(temp_storage.load_keys);
LoadItems(
loader,
keys,
d_keys_in + block_offset,
valid_items,
Int2Type<FULL_TILE>());
__syncthreads();
// Twiddle key bits if necessary
#pragma unroll
for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
{
twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
}
// Rank the twiddled keys
int inclusive_digit_prefix;
BlockRadixRank(temp_storage.ranking).RankKeys(
twiddled_keys,
ranks,
current_bit,
inclusive_digit_prefix);
// Update global scatter base offsets for each digit
if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
{
int exclusive_digit_prefix;
// Get exclusive digit prefix from inclusive prefix
#if CUB_PTX_ARCH >= 300
exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
if (threadIdx.x == 0)
exclusive_digit_prefix = 0;
#else
volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
exchange[threadIdx.x] = 0;
exchange[threadIdx.x + 1] = inclusive_digit_prefix;
exclusive_digit_prefix = exchange[threadIdx.x];
#endif
bin_offset -= exclusive_digit_prefix;
temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
bin_offset += inclusive_digit_prefix;
}
__syncthreads();
// Scatter keys
ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
// Gather/scatter values
Value values[ITEMS_PER_THREAD];
GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
}
/**
* Copy tiles within the range of input
*/
template <typename T>
__device__ __forceinline__ void Copy(
T *d_in,
T *d_out,
SizeT block_offset,
SizeT block_oob)
{
// Simply copy the input
while (block_offset + TILE_ITEMS <= block_oob)
{
T items[ITEMS_PER_THREAD];
LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
__syncthreads();
StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
block_offset += TILE_ITEMS;
}
// Clean up last partial tile with guarded-I/O
if (block_offset < block_oob)
{
SizeT valid_items = block_oob - block_offset;
T items[ITEMS_PER_THREAD];
LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
__syncthreads();
StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
}
}
/**
* Copy tiles within the range of input (specialized for NullType)
*/
__device__ __forceinline__ void Copy(
NullType *d_in,
NullType *d_out,
SizeT block_offset,
SizeT block_oob)
{}
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT bin_offset,
Key *d_keys_in,
Key *d_keys_out,
Value *d_values_in,
Value *d_values_out,
int current_bit)
:
temp_storage(temp_storage.Alias()),
bin_offset(bin_offset),
d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
d_values_in(d_values_in),
d_values_out(d_values_out),
current_bit(current_bit),
short_circuit(false)
{}
/**
* Constructor
*/
__device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT num_items,
SizeT *d_spine,
Key *d_keys_in,
Key *d_keys_out,
Value *d_values_in,
Value *d_values_out,
int current_bit)
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
d_values_in(d_values_in),
d_values_out(d_values_out),
current_bit(current_bit)
{
// Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
if (threadIdx.x < RADIX_DIGITS)
{
// Short-circuit if the first block's histogram has bin counts of only zeros or problem-size
SizeT first_block_bin_offset = d_spine[gridDim.x * threadIdx.x];
int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
this->temp_storage.short_circuit = WarpAll(predicate);
// Load my block's bin offset for my bin
bin_offset = d_spine[(gridDim.x * threadIdx.x) + blockIdx.x];
}
__syncthreads();
short_circuit = this->temp_storage.short_circuit;
}
/**
* Distribute keys from a segment of input tiles.
*/
__device__ __forceinline__ void ProcessTiles(
SizeT block_offset,
const SizeT &block_oob)
{
if (short_circuit)
{
// Copy keys
Copy(d_keys_in, d_keys_out, block_offset, block_oob);
// Copy values
Copy(d_values_in, d_values_out, block_offset, block_oob);
}
else
{
// Process full tiles of tile_items
while (block_offset + TILE_ITEMS <= block_oob)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Clean up last partial tile with guarded-I/O
if (block_offset < block_oob)
{
ProcessTile<false>(block_offset, block_oob - block_offset);
}
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
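// Illustrative sketch (not part of CUB): the "key twiddling" idea behind the
// Traits<Key>::TwiddleIn / TwiddleOut calls used above. Keys are mapped to
// unsigned bit patterns whose unsigned ordering matches the original ordering,
// so the same digit-ranking machinery handles signed integers and IEEE floats.
// Function names are assumptions for illustration.
__device__ __forceinline__ unsigned int TwiddleInInt32(int key)
{
    return static_cast<unsigned int>(key) ^ 0x80000000u;        // flip the sign bit
}
__device__ __forceinline__ int TwiddleOutInt32(unsigned int twiddled)
{
    return static_cast<int>(twiddled ^ 0x80000000u);            // inverse mapping
}
__device__ __forceinline__ unsigned int TwiddleInFloat32(float key)
{
    unsigned int bits = __float_as_uint(key);
    unsigned int mask = (bits & 0x80000000u) ? 0xffffffffu      // negative: flip all bits
                                             : 0x80000000u;     // non-negative: flip sign bit
    return bits ^ mask;
}
__device__ __forceinline__ float TwiddleOutFloat32(unsigned int twiddled)
{
    unsigned int mask = (twiddled & 0x80000000u) ? 0x80000000u  // was non-negative
                                                 : 0xffffffffu; // was negative
    return __uint_as_float(twiddled ^ mask);
}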

@ -0,0 +1,464 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in a device-wide radix sort upsweep pass.
*/
#pragma once
#include "../../thread/thread_reduce.cuh"
#include "../../thread/thread_load.cuh"
#include "../../block/block_load.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockRadixSortUpsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
int _ITEMS_PER_THREAD, ///< The number of items to load per thread per tile
PtxLoadModifier _LOAD_MODIFIER, ///< Load cache-modifier
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
struct BlockRadixSortUpsweepTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
RADIX_BITS = _RADIX_BITS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
typedef BlockRadixSortUpsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_MODIFIER,
CUB_MAX(1, RADIX_BITS - 1)> AltPolicy;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*
* Computes radix digit histograms over a range of input tiles.
*/
template <
typename BlockRadixSortUpsweepTilesPolicy,
typename Key,
typename SizeT>
struct BlockRadixSortUpsweepTiles
{
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
typedef typename Traits<Key>::UnsignedBits UnsignedBits;
// Integer type for digit counters (to be packed into words of PackedCounters)
typedef unsigned char DigitCounter;
// Integer type for packing DigitCounters into columns of shared memory banks
typedef unsigned int PackedCounter;
static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER;
enum
{
RADIX_BITS = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS,
BLOCK_THREADS = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS,
KEYS_PER_THREAD = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD,
RADIX_DIGITS = 1 << RADIX_BITS,
LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS,
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD,
BYTES_PER_COUNTER = sizeof(DigitCounter),
LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
COUNTER_LANES = 1 << LOG_COUNTER_LANES,
// To prevent counter overflow, we must periodically unpack and aggregate the
// digit counters back into registers. Each counter lane is assigned to a
// warp for aggregation.
LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
// Unroll tiles in batches without risk of counter overflow
UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD),
UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS,
};
/**
* Shared memory storage layout
*/
struct _TempStorage
{
union
{
DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS];
SizeT digit_partials[RADIX_DIGITS][WARP_THREADS + 1];
};
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Thread fields (aggregate state bundle)
//---------------------------------------------------------------------
// Shared storage for this CTA
_TempStorage &temp_storage;
// Thread-local counters for periodically aggregating composite-counter lanes
SizeT local_counts[LANES_PER_WARP][PACKING_RATIO];
// Input and output device pointers
UnsignedBits *d_keys_in;
// The least-significant bit position of the current digit to extract
int current_bit;
//---------------------------------------------------------------------
// Helper structure for templated iteration
//---------------------------------------------------------------------
// Iterate
template <int COUNT, int MAX>
struct Iterate
{
enum {
HALF = (MAX / 2),
};
// BucketKeys
static __device__ __forceinline__ void BucketKeys(
BlockRadixSortUpsweepTiles &cta,
UnsignedBits keys[KEYS_PER_THREAD])
{
cta.Bucket(keys[COUNT]);
// Next
Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
}
// ProcessTiles
static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
// Next
Iterate<1, HALF>::ProcessTiles(cta, block_offset);
Iterate<1, MAX - HALF>::ProcessTiles(cta, block_offset + (HALF * TILE_ITEMS));
}
};
// Terminate
template <int MAX>
struct Iterate<MAX, MAX>
{
// BucketKeys
static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
// ProcessTiles
static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
cta.ProcessFullTile(block_offset);
}
};
//---------------------------------------------------------------------
// Utility methods
//---------------------------------------------------------------------
/**
* Decode a key and increment corresponding smem digit counter
*/
__device__ __forceinline__ void Bucket(UnsignedBits key)
{
// Perform transform op
UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
// Add in sub-counter offset
UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO);
// Add in row offset
UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES);
// Increment counter
temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
}
/**
* Reset composite counters
*/
__device__ __forceinline__ void ResetDigitCounters()
{
#pragma unroll
for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
{
temp_storage.packed_counters[LANE][threadIdx.x] = 0;
}
}
/**
* Reset the unpacked counters in each thread
*/
__device__ __forceinline__ void ResetUnpackedCounters()
{
#pragma unroll
for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
{
#pragma unroll
for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
{
local_counts[LANE][UNPACKED_COUNTER] = 0;
}
}
}
/**
* Extracts and aggregates the digit counters for each counter lane
* owned by this warp
*/
__device__ __forceinline__ void UnpackDigitCounts()
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
#pragma unroll
for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
{
const int counter_lane = (LANE * WARPS) + warp_id;
if (counter_lane < COUNTER_LANES)
{
#pragma unroll
for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
{
#pragma unroll
for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
{
SizeT counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
local_counts[LANE][UNPACKED_COUNTER] += counter;
}
}
}
}
}
/**
* Places unpacked counters into smem for final digit reduction
*/
__device__ __forceinline__ void ReduceUnpackedCounts(SizeT &bin_count)
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
// Place unpacked digit counters in shared memory
#pragma unroll
for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
{
int counter_lane = (LANE * WARPS) + warp_id;
if (counter_lane < COUNTER_LANES)
{
int digit_row = counter_lane << LOG_PACKING_RATIO;
#pragma unroll
for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
{
temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
local_counts[LANE][UNPACKED_COUNTER];
}
}
}
__syncthreads();
// Rake-reduce bin_count reductions
if (threadIdx.x < RADIX_DIGITS)
{
bin_count = ThreadReduce<WARP_THREADS>(
temp_storage.digit_partials[threadIdx.x],
Sum());
}
}
/**
* Processes a single, full tile
*/
__device__ __forceinline__ void ProcessFullTile(SizeT block_offset)
{
// Tile of keys
UnsignedBits keys[KEYS_PER_THREAD];
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
// Prevent hoisting
// __threadfence_block();
// __syncthreads();
// Bucket tile of keys
Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
}
/**
* Processes a partial tile using single loads (may have some threads masked off)
*/
__device__ __forceinline__ void ProcessPartialTile(
SizeT block_offset,
const SizeT &block_oob)
{
// Process partial tile if necessary using single loads
block_offset += threadIdx.x;
while (block_offset < block_oob)
{
// Load and bucket key
UnsignedBits key = ThreadLoad<LOAD_MODIFIER>(d_keys_in + block_offset);
Bucket(key);
block_offset += BLOCK_THREADS;
}
}
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockRadixSortUpsweepTiles(
TempStorage &temp_storage,
Key *d_keys_in,
int current_bit)
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
current_bit(current_bit)
{}
/**
* Compute radix digit histograms from a segment of input tiles.
*/
__device__ __forceinline__ void ProcessTiles(
SizeT block_offset,
const SizeT &block_oob,
SizeT &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
{
// Reset digit counters in smem and unpacked counters in registers
ResetDigitCounters();
ResetUnpackedCounters();
// Unroll batches of full tiles
while (block_offset + UNROLLED_ELEMENTS <= block_oob)
{
Iterate<0, UNROLL_COUNT>::ProcessTiles(*this, block_offset);
block_offset += UNROLLED_ELEMENTS;
__syncthreads();
// Aggregate back into local_count registers to prevent overflow
UnpackDigitCounts();
__syncthreads();
// Reset composite counters in lanes
ResetDigitCounters();
}
// Unroll single full tiles
while (block_offset + TILE_ITEMS <= block_oob)
{
ProcessFullTile(block_offset);
block_offset += TILE_ITEMS;
}
// Process partial tile if necessary
ProcessPartialTile(
block_offset,
block_oob);
__syncthreads();
// Aggregate back into local_count registers
UnpackDigitCounts();
__syncthreads();
// Final raking reduction of counts by bin
ReduceUnpackedCounts(bin_count);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
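// ---------------------------------------------------------------------------
// Editorial sketch (not part of the library): a minimal upsweep kernel driving
// the abstraction above.  It assumes the (Policy, Key, SizeT) template ordering
// used by the other *Tiles abstractions in this commit, that RADIX_DIGITS is a
// member constant of the abstraction, and a d_bin_counts layout of
// gridDim.x * RADIX_DIGITS counters; the kernel and parameter names are made up
// for illustration only.
template <typename UpsweepPolicy, typename Key, typename SizeT>
__global__ void ExampleRadixUpsweepKernel(
    Key     *d_keys_in,     // Keys to histogram by their current radix digit
    SizeT   *d_bin_counts,  // Per-block digit counts (assumed layout: block-major)
    SizeT   num_items,
    int     current_bit)
{
    typedef cub::BlockRadixSortUpsweepTiles<UpsweepPolicy, Key, SizeT> UpsweepT;
    __shared__ typename UpsweepT::TempStorage temp_storage;

    // Contiguous even-share segment of input owned by this thread block (assumed partitioning)
    SizeT items_per_block = (num_items + gridDim.x - 1) / gridDim.x;
    SizeT block_offset    = SizeT(blockIdx.x) * items_per_block;
    SizeT block_oob       = block_offset + items_per_block;
    if (block_oob > num_items) block_oob = num_items;

    // Histogram the digits at the current bit-position for this segment of tiles
    SizeT bin_count;
    UpsweepT(temp_storage, d_keys_in, current_bit).ProcessTiles(block_offset, block_oob, bin_count);

    // bin_count is valid in the first RADIX_DIGITS threads
    if (threadIdx.x < UpsweepT::RADIX_DIGITS)
        d_bin_counts[(blockIdx.x * UpsweepT::RADIX_DIGITS) + threadIdx.x] = bin_count;
}
// ---------------------------------------------------------------------------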

View File

@ -0,0 +1,399 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockReduceByKeyTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
*/
#pragma once
#include <iterator>
#include "scan_tiles_types.cuh"
#include "../../block/block_load.cuh"
#include "../../block/block_discontinuity.cuh"
#include "../../block/block_scan.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Utility data types
******************************************************************************/
/// Scan tuple data type for reduce-value-by-key
template <typename Value, typename SizeT>
struct ReduceByKeyTuple
{
Value value; // Initially set as value, contains segment aggregate after prefix scan
SizeT flag; // Initially set as a tail flag, contains scatter offset after prefix scan
};
/// Binary reduce-by-key scan operator
template <typename ReductionOp>
struct ReduceByKeyScanOp
{
/// Reduction functor
ReductionOp reduction_op;
/// Constructor
ReduceByKeyScanOp(ReductionOp reduction_op) : reduction_op(reduction_op)
{}
/// Binary scan operator
template <typename ReduceByKeyTuple>
__device__ __forceinline__ ReduceByKeyTuple operator()(
const ReduceByKeyTuple &first,
const ReduceByKeyTuple &second)
{
ReduceByKeyTuple retval;
retval.value = (second.flag) ? second.value : reduction_op(first.value, second.value);
retval.flag = first.flag + second.flag;
return retval;
}
};
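// Worked example (editorial note): with Sum as the reduction functor and flags
// marking segment tails,
//   {value=3, flag=0} (+) {value=4, flag=0}  ->  {value=7, flag=0}   (same segment: values accumulate)
//   {value=7, flag=0} (+) {value=5, flag=1}  ->  {value=5, flag=1}   (tail flagged: running value restarts at the right-hand operand)
// so after an inclusive prefix scan with this operator, a flagged element's
// value holds its segment's aggregate and its flag holds the number of segment
// tails seen so far, i.e. (after decrementing) its scatter offset.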
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockReduceByKeyTiles
*/
template <
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
BlockLoadAlgorithm _LOAD_ALGORITHM,
bool _LOAD_WARP_TIME_SLICING,
PtxLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM>
struct BlockReduceByKeyTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING,
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockReduceByKeyTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
*/
template <
typename BlockReduceByKeyTilesPolicy, ///< Tuning policy
typename KeyInputIteratorRA, ///< Random-access input iterator type for keys
typename KeyOutputIteratorRA, ///< Random-access output iterator type for keys
typename ValueInputIteratorRA, ///< Random-access input iterator type for values
typename ValueOutputIteratorRA, ///< Random-access output iterator type for values
typename ReductionOp, ///< Reduction functor type
typename SizeT> ///< Offset integer type
struct BlockReduceByKeyTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Data types of input iterators
typedef typename std::iterator_traits<KeyInputIteratorRA>::value_type Key; // Key data type
typedef typename std::iterator_traits<ValueInputIteratorRA>::value_type Value; // Value data type
// Constants
enum
{
BLOCK_THREADS = BlockReduceByKeyTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockReduceByKeyTilesPolicy::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
STATUS_PADDING = PtxArchProps::WARP_THREADS,
};
// Block load type for keys
typedef BlockLoad<
KeyInputIteratorRA,
BlockReduceByKeyTilesPolicy::BLOCK_THREADS,
BlockReduceByKeyTilesPolicy::ITEMS_PER_THREAD,
BlockReduceByKeyTilesPolicy::LOAD_ALGORITHM,
BlockReduceByKeyTilesPolicy::LOAD_MODIFIER,
BlockReduceByKeyTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadKeys;
// Block load type for values
typedef BlockLoad<
ValueInputIteratorRA,
BlockReduceByKeyTilesPolicy::BLOCK_THREADS,
BlockReduceByKeyTilesPolicy::ITEMS_PER_THREAD,
BlockReduceByKeyTilesPolicy::LOAD_ALGORITHM,
BlockReduceByKeyTilesPolicy::LOAD_MODIFIER,
BlockReduceByKeyTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadValues;
// Block discontinuity type for setting tail flags
typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys;
// Scan tuple type
typedef ReduceByKeyTuple<Value, SizeT> ScanTuple;
// Tile status descriptor type
typedef ScanTileDescriptor<ScanTuple> ScanTileDescriptorT;
// Block scan functor type
typedef ReduceByKeyScanOp<ReductionOp> ScanOp;
// Block scan prefix callback type
typedef DeviceScanBlockPrefixOp<ScanTuple, ScanOp> PrefixCallback;
// Block scan type
typedef BlockScan<
ScanTuple,
BlockReduceByKeyTilesPolicy::BLOCK_THREADS,
BlockReduceByKeyTilesPolicy::SCAN_ALGORITHM> BlockScanT;
/// Shared memory type for this threadblock
struct _TempStorage
{
union
{
typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading tiles of keys
typename BlockLoadValues::TempStorage load_values; // Smem needed for loading tiles of values
struct
{
typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
typename PrefixCallback::TempStorage prefix; // Smem needed for cooperative prefix callback
};
};
typename BlockDiscontinuityKeys::TempStorage flagging; // Smem needed for tile scanning
SizeT tile_idx; // Shared tile index
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage &temp_storage; ///< Reference to temp_storage
KeyInputIteratorRA d_keys_in; ///< Key input data
KeyOutputIteratorRA d_keys_out; ///< Key output data
ValueInputIteratorRA d_values_in; ///< Value input data
ValueOutputIteratorRA d_values_out; ///< Value output data
ScanTileDescriptorT *d_tile_status; ///< Global list of tile status
ScanOp scan_op; ///< Binary scan operator
int num_tiles; ///< Total number of input tiles for the entire problem
SizeT num_items; ///< Total number of scan items for the entire problem
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
BlockReduceByKeyTiles(
TempStorage &temp_storage, ///< Reference to temp_storage
KeyInputIteratorRA d_keys_in, ///< Key input data
KeyOutputIteratorRA d_keys_out, ///< Key output data
ValueInputIteratorRA d_values_in, ///< Value input data
ValueOutputIteratorRA d_values_out, ///< Value output data
ScanTileDescriptorT *d_tile_status, ///< Global list of tile status
ReductionOp reduction_op, ///< Binary scan operator
int num_tiles, ///< Total number of input tiles for the entire problem
SizeT num_items) ///< Total number of scan items for the entire problem
:
temp_storage(temp_storage.Alias()),
d_keys_in(d_keys_in),
d_keys_out(d_keys_out),
d_values_in(d_values_in),
d_values_out(d_values_out),
d_tile_status(d_tile_status),
scan_op(reduction_op),
num_tiles(num_tiles),
num_items(num_items)
{}
/**
* Process a tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
int tile_idx, ///< Tile index
SizeT block_offset, ///< Tile offset
int valid_items = TILE_ITEMS) ///< Number of valid items in the tile
{
Key keys[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
int tail_flags[ITEMS_PER_THREAD];
ScanTuple scan_tuples[ITEMS_PER_THREAD];
// Load keys
if (FULL_TILE)
BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
else
BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items);
// Set tail flags
if (tile_idx == num_tiles - 1)
{
// Last tile
BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality());
}
else
{
// Preceding tiles require the first element of the next tile
Key tile_suffix_item;
if (threadIdx.x == 0)
tile_suffix_item = d_keys_in[block_offset + TILE_ITEMS];
BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality(), tile_suffix_item);
}
__syncthreads();
// Load values
if (FULL_TILE)
BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
else
BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items);
// Assemble scan tuples
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
scan_tuples[ITEM].value = values[ITEM];
scan_tuples[ITEM].flag = tail_flags[ITEM];
}
__syncthreads();
// Perform inclusive prefix scan
ScanTuple block_aggregate;
if (tile_idx == 0)
{
// Without prefix callback
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate);
// Update tile status
if (threadIdx.x == 0)
ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
}
else
{
// With prefix callback
PrefixCallback prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx);
BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate, prefix_op);
}
// Scatter flagged keys and values to output
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int tile_item = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
// Set the tail flag on the last item in a partially-full tile
if (!FULL_TILE && (tile_item == valid_items - 1))
tail_flags[ITEM] = 1;
// Decrement scatter offset
scan_tuples[ITEM].flag--;
// Scatter key and aggregate value if flagged and in range
if ((FULL_TILE || (tile_item < valid_items)) && (tail_flags[ITEM]))
{
d_keys_out[scan_tuples[ITEM].flag] = keys[ITEM];
d_values_out[scan_tuples[ITEM].flag] = scan_tuples[ITEM].value;
}
}
}
/**
* Dequeue and scan tiles of elements
*/
__device__ __forceinline__ void ProcessTiles(GridQueue<int> queue) ///< Queue descriptor for assigning tiles of work to thread blocks
{
// We give each thread block at least one tile of input
int tile_idx = blockIdx.x;
// Consume full tiles of input
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
while (block_offset + TILE_ITEMS <= num_items)
{
ConsumeTile<true>(tile_idx, block_offset);
// Get next tile
#if CUB_PTX_ARCH < 200
// No concurrent kernels allowed, so just stripe tiles
tile_idx += gridDim.x;
#else
// Concurrent kernels are allowed, so we must only use active blocks to dequeue tile indices
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1) + gridDim.x;
__syncthreads();
tile_idx = temp_storage.tile_idx;
#endif
block_offset = SizeT(TILE_ITEMS) * tile_idx;
}
// Consume a partially-full tile
if (block_offset < num_items)
{
// Consume a partially-full tile
int valid_items = num_items - block_offset;
ConsumeTile<false>(tile_idx, block_offset, valid_items);
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
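// ---------------------------------------------------------------------------
// Editorial sketch (not part of the library): a minimal kernel driving the
// reduce-by-key abstraction above with raw pointers as iterators.  It assumes
// the tile-status array and the grid queue have already been initialized by a
// separate kernel, as the surrounding device-wide dispatch code normally
// arranges; the kernel and parameter names are made up for illustration only.
template <typename BlockReduceByKeyTilesPolicy, typename Key, typename Value, typename ReductionOp, typename SizeT>
__global__ void ExampleReduceByKeyKernel(
    Key                                                            *d_keys_in,
    Key                                                            *d_keys_out,
    Value                                                          *d_values_in,
    Value                                                          *d_values_out,
    cub::ScanTileDescriptor<cub::ReduceByKeyTuple<Value, SizeT> >  *d_tile_status,   // Pre-initialized tile status
    ReductionOp                                                    reduction_op,
    int                                                            num_tiles,
    SizeT                                                          num_items,
    cub::GridQueue<int>                                            queue)            // Pre-filled tile queue
{
    typedef cub::BlockReduceByKeyTiles<
        BlockReduceByKeyTilesPolicy, Key*, Key*, Value*, Value*, ReductionOp, SizeT> BlockReduceByKeyTilesT;

    __shared__ typename BlockReduceByKeyTilesT::TempStorage temp_storage;

    // Dequeue and consume tiles, scattering each segment's key and aggregate value
    BlockReduceByKeyTilesT(
        temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out,
        d_tile_status, reduction_op, num_tiles, num_items).ProcessTiles(queue);
}
// ---------------------------------------------------------------------------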

View File

@ -0,0 +1,375 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
*/
#pragma once
#include <iterator>
#include "../../block/block_load.cuh"
#include "../../block/block_reduce.cuh"
#include "../../grid/grid_mapping.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../grid/grid_even_share.cuh"
#include "../../util_vector.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockReduceTiles
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread per tile of input
int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
PtxLoadModifier _LOAD_MODIFIER, ///< PTX load modifier
GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
struct BlockReduceTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
};
static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
*
* Each thread reduces only the values it loads. If \p FIRST_TILE, this
* partial reduction is stored into \p thread_aggregate. Otherwise it is
* accumulated into \p thread_aggregate.
*/
template <
typename BlockReduceTilesPolicy,
typename InputIteratorRA,
typename SizeT,
typename ReductionOp>
struct BlockReduceTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
typedef typename std::iterator_traits<InputIteratorRA>::value_type T; // Type of input iterator
typedef VectorHelper<T, BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH> VecHelper; // Helper type for vectorizing loads of T
typedef typename VecHelper::Type VectorT; // Vector of T
// Constants
enum
{
BLOCK_THREADS = BlockReduceTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockReduceTilesPolicy::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
VECTOR_LOAD_LENGTH = BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH,
// Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive
CAN_VECTORIZE = (BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH > 1) &&
(IsPointer<InputIteratorRA>::VALUE) &&
(VecHelper::BUILT_IN),
};
static const PtxLoadModifier LOAD_MODIFIER = BlockReduceTilesPolicy::LOAD_MODIFIER;
static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockReduceTilesPolicy::BLOCK_ALGORITHM;
// Parameterized BlockReduce primitive
typedef BlockReduce<T, BLOCK_THREADS, BlockReduceTilesPolicy::BLOCK_ALGORITHM> BlockReduceT;
/// Shared memory type required by this thread block
typedef typename BlockReduceT::TempStorage _TempStorage;
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
T thread_aggregate; ///< Each thread's partial reduction
_TempStorage& temp_storage; ///< Reference to temp_storage
InputIteratorRA d_in; ///< Input data to reduce
ReductionOp reduction_op; ///< Binary reduction operator
int first_tile_size; ///< Size of first tile consumed
bool input_aligned; ///< Whether or not input is vector-aligned
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockReduceTiles(
TempStorage& temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data to reduce
ReductionOp reduction_op) ///< Binary reduction operator
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
reduction_op(reduction_op),
first_tile_size(0),
input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0))
{}
/**
* Process a single tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
SizeT block_offset, ///< The offset of the tile to consume
int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
{
if (FULL_TILE)
{
T stripe_partial;
// Load full tile
if (input_aligned)
{
// Alias items as an array of VectorT and load it in striped fashion
enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
VectorT vec_items[WORDS];
// Load striped into vec items
VectorT* alias_ptr = reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH));
#pragma unroll
for (int i = 0; i < WORDS; ++i)
vec_items[i] = alias_ptr[BLOCK_THREADS * i];
// Reduce items within each thread stripe
stripe_partial = ThreadReduce<ITEMS_PER_THREAD>(
reinterpret_cast<T*>(vec_items),
reduction_op);
}
else
{
T items[ITEMS_PER_THREAD];
// Load items in striped fashion
LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
// Reduce items within each thread stripe
stripe_partial = ThreadReduce(items, reduction_op);
}
// Update running thread aggregate
thread_aggregate = (first_tile_size) ?
reduction_op(thread_aggregate, stripe_partial) : // Update
stripe_partial; // Assign
}
else
{
// Partial tile
int thread_offset = threadIdx.x;
if (!first_tile_size && (thread_offset < valid_items))
{
// Assign thread_aggregate
thread_aggregate = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset);
thread_offset += BLOCK_THREADS;
}
while (thread_offset < valid_items)
{
// Update thread aggregate
T item = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset);
thread_aggregate = reduction_op(thread_aggregate, item);
thread_offset += BLOCK_THREADS;
}
}
// Set first tile size if necessary
if (!first_tile_size)
first_tile_size = valid_items;
}
//---------------------------------------------------------------------
// Consume a contiguous segment of tiles
//---------------------------------------------------------------------
/**
* \brief Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
SizeT block_oob, ///< [in] Threadblock end offset (exclusive)
T &block_aggregate) ///< [out] Running total
{
// Consume subsequent full tiles of input
while (block_offset + TILE_ITEMS <= block_oob)
{
ConsumeTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Consume a partially-full tile
if (block_offset < block_oob)
{
int valid_items = block_oob - block_offset;
ConsumeTile<false>(block_offset, valid_items);
}
// Compute block-wide reduction
block_aggregate = (first_tile_size < TILE_ITEMS) ?
BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
}
/**
* Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT num_items, ///< [in] Total number of global input items
GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor
T &block_aggregate, ///< [out] Running total
Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
{
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
// Consume input tiles
ConsumeTiles(even_share.block_offset, even_share.block_oob, block_aggregate);
}
//---------------------------------------------------------------------
// Dynamically consume tiles
//---------------------------------------------------------------------
/**
* Dequeue and reduce tiles of items as part of an inter-block reduction
*/
__device__ __forceinline__ void ConsumeTiles(
int num_items, ///< Total number of input items
GridQueue<SizeT> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
T &block_aggregate) ///< [out] Running total
{
// Shared dequeue offset
__shared__ SizeT dequeue_offset;
// We give each thread block at least one tile of input.
SizeT block_offset = blockIdx.x * TILE_ITEMS;
SizeT even_share_base = gridDim.x * TILE_ITEMS;
if (block_offset + TILE_ITEMS <= num_items)
{
// Consume full tile of input
ConsumeTile<true>(block_offset);
// Dequeue more tiles
while (true)
{
// Dequeue a tile of items
if (threadIdx.x == 0)
dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
__syncthreads();
// Grab tile offset and check if we're done with full tiles
block_offset = dequeue_offset;
__syncthreads();
if (block_offset + TILE_ITEMS > num_items)
break;
// Consume a full tile
ConsumeTile<true>(block_offset);
}
}
if (block_offset < num_items)
{
int valid_items = num_items - block_offset;
ConsumeTile<false>(block_offset, valid_items);
}
// Compute block-wide reduction
block_aggregate = (first_tile_size < TILE_ITEMS) ?
BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
}
/**
* Dequeue and reduce tiles of items as part of an inter-block reduction
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT num_items, ///< [in] Total number of global input items
GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor
GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor
T &block_aggregate, ///< [out] Running total
Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
{
ConsumeTiles(num_items, queue, block_aggregate);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
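// ---------------------------------------------------------------------------
// Editorial sketch (not part of the library): a minimal even-share reduction
// kernel driving the abstraction above with a raw pointer as the input
// iterator.  Each thread block reduces a contiguous segment of tiles and
// writes one partial result; the host (or a second kernel) is assumed to
// reduce d_block_partials afterwards.  Names are made up for illustration.
template <typename BlockReduceTilesPolicy, typename T, typename SizeT, typename ReductionOp>
__global__ void ExampleReduceKernel(
    T           *d_in,              // Input items
    T           *d_block_partials,  // One partial reduction per thread block (assumed output layout)
    SizeT       num_items,
    ReductionOp reduction_op)
{
    typedef cub::BlockReduceTiles<BlockReduceTilesPolicy, T*, SizeT, ReductionOp> BlockReduceTilesT;
    __shared__ typename BlockReduceTilesT::TempStorage temp_storage;

    // Contiguous even-share segment owned by this thread block (assumed partitioning)
    SizeT items_per_block = (num_items + gridDim.x - 1) / gridDim.x;
    SizeT block_offset    = SizeT(blockIdx.x) * items_per_block;
    SizeT block_oob       = block_offset + items_per_block;
    if (block_oob > num_items) block_oob = num_items;

    if (block_offset < block_oob)
    {
        // Reduce this block's segment of tiles
        T block_aggregate;
        BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles(block_offset, block_oob, block_aggregate);

        // The block-wide aggregate is returned to thread 0
        if (threadIdx.x == 0)
            d_block_partials[blockIdx.x] = block_aggregate;
    }
}
// ---------------------------------------------------------------------------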

View File

@ -0,0 +1,509 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
*/
#pragma once
#include <iterator>
#include "scan_tiles_types.cuh"
#include "../../block/block_load.cuh"
#include "../../block/block_store.cuh"
#include "../../block/block_scan.cuh"
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Tuning policy for BlockScanTiles
*/
template <
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD,
BlockLoadAlgorithm _LOAD_ALGORITHM,
bool _LOAD_WARP_TIME_SLICING,
PtxLoadModifier _LOAD_MODIFIER,
BlockStoreAlgorithm _STORE_ALGORITHM,
bool _STORE_WARP_TIME_SLICING,
BlockScanAlgorithm _SCAN_ALGORITHM>
struct BlockScanTilesPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS,
ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING,
STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,
};
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
};
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
*
* Implements a single-pass "domino" strategy with adaptive prefix lookback.
*/
template <
typename BlockScanTilesPolicy, ///< Tuning policy
typename InputIteratorRA, ///< Input iterator type
typename OutputIteratorRA, ///< Output iterator type
typename ScanOp, ///< Scan functor type
typename Identity, ///< Identity element type (cub::NullType for inclusive scan)
typename SizeT> ///< Offset integer type
struct BlockScanTiles
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Data type of input iterator
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Constants
enum
{
INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided
BLOCK_THREADS = BlockScanTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockScanTilesPolicy::ITEMS_PER_THREAD,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Block load type
typedef BlockLoad<
InputIteratorRA,
BlockScanTilesPolicy::BLOCK_THREADS,
BlockScanTilesPolicy::ITEMS_PER_THREAD,
BlockScanTilesPolicy::LOAD_ALGORITHM,
BlockScanTilesPolicy::LOAD_MODIFIER,
BlockScanTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadT;
// Block store type
typedef BlockStore<
OutputIteratorRA,
BlockScanTilesPolicy::BLOCK_THREADS,
BlockScanTilesPolicy::ITEMS_PER_THREAD,
BlockScanTilesPolicy::STORE_ALGORITHM,
STORE_DEFAULT,
BlockScanTilesPolicy::STORE_WARP_TIME_SLICING> BlockStoreT;
// Tile status descriptor type
typedef ScanTileDescriptor<T> ScanTileDescriptorT;
// Block scan type
typedef BlockScan<
T,
BlockScanTilesPolicy::BLOCK_THREADS,
BlockScanTilesPolicy::SCAN_ALGORITHM> BlockScanT;
// Callback type for obtaining inter-tile prefix during block scan
typedef DeviceScanBlockPrefixOp<T, ScanOp> InterblockPrefixOp;
// Shared memory type for this threadblock
struct _TempStorage
{
union
{
typename BlockLoadT::TempStorage load; // Smem needed for tile loading
typename BlockStoreT::TempStorage store; // Smem needed for tile storing
struct
{
typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback
typename BlockScanT::TempStorage scan; // Smem needed for tile scanning
};
};
SizeT tile_idx; // Shared tile index
};
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage &temp_storage; ///< Reference to temp_storage
InputIteratorRA d_in; ///< Input data
OutputIteratorRA d_out; ///< Output data
ScanOp scan_op; ///< Binary scan operator
Identity identity; ///< Identity element
//---------------------------------------------------------------------
// Block scan utility methods (first tile)
//---------------------------------------------------------------------
/**
* Exclusive scan specialization
*/
template <typename _ScanOp, typename _Identity>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
{
BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
}
/**
* Exclusive sum specialization
*/
template <typename _Identity>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
{
BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
}
/**
* Inclusive scan specialization
*/
template <typename _ScanOp>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
{
BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
}
/**
* Inclusive sum specialization
*/
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
{
BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
}
//---------------------------------------------------------------------
// Block scan utility methods (subsequent tiles)
//---------------------------------------------------------------------
/**
* Exclusive scan specialization (with prefix from predecessors)
*/
template <typename _ScanOp, typename _Identity, typename PrefixCallback>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
{
BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
}
/**
* Exclusive sum specialization (with prefix from predecessors)
*/
template <typename _Identity, typename PrefixCallback>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
{
BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
}
/**
* Inclusive scan specialization (with prefix from predecessors)
*/
template <typename _ScanOp, typename PrefixCallback>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
{
BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
}
/**
* Inclusive sum specialization (with prefix from predecessors)
*/
template <typename PrefixCallback>
__device__ __forceinline__
void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
{
BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
}
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
// Constructor
__device__ __forceinline__
BlockScanTiles(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
ScanOp scan_op, ///< Binary scan operator
Identity identity) ///< Identity element
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_out(d_out),
scan_op(scan_op),
identity(identity)
{}
//---------------------------------------------------------------------
// Domino scan
//---------------------------------------------------------------------
/**
* Process a tile of input (domino scan)
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
SizeT num_items, ///< Total number of input items
int tile_idx, ///< Tile index
SizeT block_offset, ///< Tile offset
ScanTileDescriptorT *d_tile_status) ///< Global list of tile status
{
// Load items
T items[ITEMS_PER_THREAD];
if (FULL_TILE)
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
else
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_items - block_offset);
__syncthreads();
T block_aggregate;
if (tile_idx == 0)
{
ScanBlock(items, scan_op, identity, block_aggregate);
// Update tile status if there are successor tiles
if (FULL_TILE && (threadIdx.x == 0))
ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate);
}
else
{
InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx);
ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
}
__syncthreads();
// Store items
if (FULL_TILE)
BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
else
BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_items - block_offset);
}
/**
* Dequeue and scan tiles of items as part of a domino scan
*/
__device__ __forceinline__ void ConsumeTiles(
int num_items, ///< Total number of input items
GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
ScanTileDescriptorT *d_tile_status) ///< Global list of tile status
{
#if CUB_PTX_ARCH < 200
// No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks)
int tile_idx = blockIdx.x;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
if (block_offset + TILE_ITEMS <= num_items)
ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);
else if (block_offset < num_items)
ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);
#else
// Get first tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
int tile_idx = temp_storage.tile_idx;
SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx;
while (block_offset + TILE_ITEMS <= num_items)
{
// Consume full tile
ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status);
// Get next tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);
__syncthreads();
tile_idx = temp_storage.tile_idx;
block_offset = SizeT(TILE_ITEMS) * tile_idx;
}
// Consume a partially-full tile
if (block_offset < num_items)
{
ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status);
}
#endif
}
//---------------------------------------------------------------------
// Even-share scan
//---------------------------------------------------------------------
/**
* Process a tile of input
*/
template <
bool FULL_TILE,
bool FIRST_TILE>
__device__ __forceinline__ void ConsumeTile(
SizeT block_offset, ///< Tile offset
RunningBlockPrefixOp<T> &prefix_op, ///< Running prefix operator
int valid_items = TILE_ITEMS) ///< Number of valid items in the tile
{
// Load items
T items[ITEMS_PER_THREAD];
if (FULL_TILE)
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
else
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
__syncthreads();
// Block scan
T block_aggregate;
if (FIRST_TILE)
{
ScanBlock(items, scan_op, identity, block_aggregate);
prefix_op.running_total = block_aggregate;
}
else
{
ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
}
__syncthreads();
// Store items
if (FULL_TILE)
BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
else
BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
}
/**
* Scan a consecutive share of input tiles
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
SizeT block_oob) ///< [in] Threadblock end offset (exclusive)
{
RunningBlockPrefixOp<T> prefix_op;
if (block_offset + TILE_ITEMS <= block_oob)
{
// Consume first tile of input (full)
ConsumeTile<true, true>(block_offset, prefix_op);
block_offset += TILE_ITEMS;
// Consume subsequent full tiles of input
while (block_offset + TILE_ITEMS <= block_oob)
{
ConsumeTile<true, false>(block_offset, prefix_op);
block_offset += TILE_ITEMS;
}
// Consume a partially-full tile
if (block_offset < block_oob)
{
int valid_items = block_oob - block_offset;
ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
}
}
else
{
// Consume the first tile of input (partially-full)
int valid_items = block_oob - block_offset;
ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
}
}
/**
* Scan a consecutive share of input tiles, seeded with the specified prefix value
*/
__device__ __forceinline__ void ConsumeTiles(
SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
SizeT block_oob, ///< [in] Threadblock end offset (exclusive)
T prefix) ///< [in] The prefix to apply to the scan segment
{
RunningBlockPrefixOp<T> prefix_op;
prefix_op.running_total = prefix;
// Consume full tiles of input
while (block_offset + TILE_ITEMS <= block_oob)
{
ConsumeTile<true, false>(block_offset, prefix_op);
block_offset += TILE_ITEMS;
}
// Consume a partially-full tile
if (block_offset < block_oob)
{
int valid_items = block_oob - block_offset;
ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
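// ---------------------------------------------------------------------------
// Editorial sketch (not part of the library): a minimal kernel driving the
// domino (dequeue-based) scan path above with raw pointers as iterators, here
// as an exclusive scan (Identity = T).  It assumes the tile-status array and
// the grid queue have already been initialized by a separate kernel, as the
// surrounding device-wide dispatch code normally arranges; names are made up
// for illustration only.
template <typename BlockScanTilesPolicy, typename T, typename ScanOp, typename SizeT>
__global__ void ExampleScanKernel(
    T                           *d_in,
    T                           *d_out,
    cub::ScanTileDescriptor<T>  *d_tile_status,   // Pre-initialized tile status
    ScanOp                      scan_op,
    T                           identity,
    int                         num_items,
    cub::GridQueue<int>         queue)            // Pre-filled tile queue
{
    typedef cub::BlockScanTiles<BlockScanTilesPolicy, T*, T*, ScanOp, T, SizeT> BlockScanTilesT;
    __shared__ typename BlockScanTilesT::TempStorage temp_storage;

    // Dequeue and scan tiles; predecessors' prefixes are obtained through d_tile_status
    BlockScanTilesT(temp_storage, d_in, d_out, scan_op, identity).ConsumeTiles(num_items, queue, d_tile_status);
}
// ---------------------------------------------------------------------------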

View File

@ -0,0 +1,318 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Utility types for device-wide scan
*/
#pragma once
#include <iterator>
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../warp/warp_reduce.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* Enumerations of tile status
*/
enum ScanTileStatus
{
SCAN_TILE_OOB, // Out-of-bounds (e.g., padding)
SCAN_TILE_INVALID, // Not yet processed
SCAN_TILE_PARTIAL, // Tile aggregate is available
SCAN_TILE_PREFIX, // Inclusive tile prefix is available
};
/**
* Data type of tile status descriptor.
*
* Specialized for scan status and value types that can be combined into the same
* machine word that can be read/written coherently in a single access.
*/
template <
typename T,
bool SINGLE_WORD = (PowerOfTwo<sizeof(T)>::VALUE && (sizeof(T) <= 8))>
struct ScanTileDescriptor
{
// Status word type
typedef typename If<(sizeof(T) == 8),
long long,
typename If<(sizeof(T) == 4),
int,
typename If<(sizeof(T) == 2),
short,
char>::Type>::Type>::Type StatusWord;
// Vector word type
typedef typename If<(sizeof(T) == 8),
longlong2,
typename If<(sizeof(T) == 4),
int2,
typename If<(sizeof(T) == 2),
int,
short>::Type>::Type>::Type VectorWord;
T value;
StatusWord status;
static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix)
{
ScanTileDescriptor tile_descriptor;
tile_descriptor.status = SCAN_TILE_PREFIX;
tile_descriptor.value = prefix;
VectorWord alias;
*reinterpret_cast<ScanTileDescriptor*>(&alias) = tile_descriptor;
ThreadStore<STORE_CG>(reinterpret_cast<VectorWord*>(ptr), alias);
}
static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial)
{
ScanTileDescriptor tile_descriptor;
tile_descriptor.status = SCAN_TILE_PARTIAL;
tile_descriptor.value = partial;
VectorWord alias;
*reinterpret_cast<ScanTileDescriptor*>(&alias) = tile_descriptor;
ThreadStore<STORE_CG>(reinterpret_cast<VectorWord*>(ptr), alias);
}
static __device__ __forceinline__ void WaitForValid(
ScanTileDescriptor *ptr,
int &status,
T &value)
{
ScanTileDescriptor tile_descriptor;
while (true)
{
VectorWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<VectorWord*>(ptr));
tile_descriptor = *reinterpret_cast<ScanTileDescriptor*>(&alias);
if (tile_descriptor.status != SCAN_TILE_INVALID) break;
__threadfence_block();
}
status = tile_descriptor.status;
value = tile_descriptor.value;
}
};
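// Worked example (editorial note): with T = int the descriptor is a 4-byte
// value plus a 4-byte status word, so VectorWord is int2 and the whole
// {value, status} pair is written and read in one 8-byte access; with
// T = double the pair travels in a 16-byte longlong2.  Packing both fields
// into a single machine word is what lets WaitForValid() observe a consistent
// snapshot without extra fencing between the value and status updates.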
/**
* Data type of tile status descriptor.
*
* Specialized for scan status and value types that cannot be fused into
* the same machine word.
*/
template <typename T>
struct ScanTileDescriptor<T, false>
{
T prefix_value;
T partial_value;
/// Workaround for the fact that win32 doesn't guarantee 16B alignment for 16B values of T
union
{
int status;
Uninitialized<T> padding;
};
static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix)
{
ThreadStore<STORE_CG>(&ptr->prefix_value, prefix);
__threadfence_block();
// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordering
ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PREFIX);
}
static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial)
{
ThreadStore<STORE_CG>(&ptr->partial_value, partial);
__threadfence_block();
// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordering
ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PARTIAL);
}
static __device__ __forceinline__ void WaitForValid(
ScanTileDescriptor *ptr,
int &status,
T &value)
{
while (true)
{
status = ThreadLoad<LOAD_CG>(&ptr->status);
if (status != SCAN_TILE_INVALID) break;
__threadfence_block();
}
value = (status == SCAN_TILE_PARTIAL) ?
ThreadLoad<LOAD_CG>(&ptr->partial_value) :
ThreadLoad<LOAD_CG>(&ptr->prefix_value);
}
};
/**
* Stateful prefix functor that provides the running prefix for
* the current tile by using the callback warp to wait on
* aggregates/prefixes from predecessor tiles to become available
*/
template <
typename T,
typename ScanOp>
struct DeviceScanBlockPrefixOp
{
// Parameterized warp reduce
typedef WarpReduce<T> WarpReduceT;
// Storage type
typedef typename WarpReduceT::TempStorage _TempStorage;
// Alias wrapper allowing storage to be unioned
typedef Uninitialized<_TempStorage> TempStorage;
// Tile status descriptor type
typedef ScanTileDescriptor<T> ScanTileDescriptorT;
// Fields
ScanTileDescriptorT *d_tile_status; ///< Pointer to array of tile status
_TempStorage &temp_storage; ///< Reference to a warp-reduction instance
ScanOp scan_op; ///< Binary scan operator
int tile_idx; ///< The current tile index
T inclusive_prefix; ///< Inclusive prefix for the tile
// Constructor
__device__ __forceinline__
DeviceScanBlockPrefixOp(
ScanTileDescriptorT *d_tile_status,
TempStorage &temp_storage,
ScanOp scan_op,
int tile_idx) :
d_tile_status(d_tile_status),
temp_storage(temp_storage.Alias()),
scan_op(scan_op),
tile_idx(tile_idx) {}
// Block until all predecessors within the specified window have non-invalid status
__device__ __forceinline__
void ProcessWindow(
int predecessor_idx,
int &predecessor_status,
T &window_aggregate)
{
T value;
ScanTileDescriptorT::WaitForValid(d_tile_status + predecessor_idx, predecessor_status, value);
// Perform a segmented reduction to get the prefix for the current window
int flag = (predecessor_status != SCAN_TILE_PARTIAL);
window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(value, flag, scan_op);
}
// Prefix functor (called by the first warp)
__device__ __forceinline__
T operator()(T block_aggregate)
{
// Update our status with our tile-aggregate
if (threadIdx.x == 0)
{
ScanTileDescriptorT::SetPartial(d_tile_status + tile_idx, block_aggregate);
}
// Wait for the window of predecessor tiles to become valid
int predecessor_idx = tile_idx - threadIdx.x - 1;
int predecessor_status;
T window_aggregate;
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
// The exclusive tile prefix starts out as the current window aggregate
T exclusive_prefix = window_aggregate;
// Keep sliding the window back until we come across a tile whose inclusive prefix is known
while (WarpAll(predecessor_status != SCAN_TILE_PREFIX))
{
predecessor_idx -= PtxArchProps::WARP_THREADS;
// Update exclusive tile prefix with the window prefix
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
}
// Compute the inclusive tile prefix and update the status for this tile
if (threadIdx.x == 0)
{
inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
ScanTileDescriptorT::SetPrefix(
d_tile_status + tile_idx,
inclusive_prefix);
}
// Return exclusive_prefix
return exclusive_prefix;
}
};
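// Worked example (editorial note): for tile_idx = 70 and 32-thread warps, lane i
// of the callback warp first waits on tile 69 - i, i.e. the window of tiles
// 38..69.  The tail-segmented reduction accumulates aggregates only up to the
// nearest tile in that window whose inclusive prefix is already posted; if no
// such tile exists yet, the window slides back to tiles 6..37, and so on, each
// step folding another window aggregate into the running exclusive prefix.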
// Running scan prefix callback type for single-block scans.
// Maintains a running prefix that can be applied to consecutive
// scan operations.
template <typename T>
struct RunningBlockPrefixOp
{
// Running prefix
T running_total;
// Callback operator.
__device__ T operator()(T block_aggregate)
{
T old_prefix = running_total;
running_total += block_aggregate;
return old_prefix;
}
};
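// Worked example (editorial note): seeded with running_total = 0 and invoked
// with successive block aggregates of 10, 7 and 5, the callback returns 0, 10
// and 17 and leaves running_total at 22 -- each tile is scanned relative to
// the total of the tiles already consumed by the same thread block.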
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@ -0,0 +1,184 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
*/
#pragma once
#include <iterator>
#include "../../../util_type.cuh"
#include "../../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
*/
template <
typename BlockHistogramTilesPolicy, ///< Tuning policy
int BINS, ///< Number of histogram bins per channel
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1]
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integer type for offsets
struct BlockHistogramTilesGlobalAtomic
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Sample type
typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
// Constants
enum
{
BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
};
// Shared memory type required by this thread block
typedef NullType TempStorage;
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
/// Reference to output histograms
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
/// Input data to reduce
InputIteratorRA d_in;
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockHistogramTilesGlobalAtomic(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data to reduce
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
:
d_in(d_in),
d_out_histograms(d_out_histograms)
{}
/**
* Process a single tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
SizeT block_offset, ///< The offset of the tile to consume
int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
{
if (FULL_TILE)
{
// Full tile of samples to read and composite
SampleT items[ITEMS_PER_THREAD][CHANNELS];
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (CHANNEL < ACTIVE_CHANNELS)
{
items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
}
}
}
__threadfence_block();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (CHANNEL < ACTIVE_CHANNELS)
{
atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
}
}
}
}
else
{
// Only a partially-full tile of samples to read and composite
int bounds = valid_items - (threadIdx.x * CHANNELS);
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
{
SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
atomicAdd(d_out_histograms[CHANNEL] + item, 1);
}
}
}
}
}
/**
* Aggregate results into output
*/
__device__ __forceinline__ void AggregateOutput()
{}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
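
Editorial note (not part of the committed CUB sources): the global-atomic strategy documented above boils down to one atomicAdd into a global-memory histogram per sample. Below is a minimal single-channel sketch of that idea; the names histogram_global_atomic and NUM_BINS are illustrative only and do not appear in CUB.

#include <cuda_runtime.h>

#define NUM_BINS 256

// One global-memory atomic per sample: simple, but every increment contends on
// the same global histogram, which is what the privatized variants below avoid.
__global__ void histogram_global_atomic(const unsigned char *d_samples,
                                        int num_samples,
                                        unsigned int *d_histogram)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (; i < num_samples; i += stride)
        atomicAdd(&d_histogram[d_samples[i]], 1u);
}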

View File

@ -0,0 +1,237 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
 * cub::BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using shared atomics
*/
#pragma once
#include <iterator>
#include "../../../util_type.cuh"
#include "../../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
 * BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using shared atomics
*/
template <
typename BlockHistogramTilesPolicy, ///< Tuning policy
int BINS, ///< Number of histogram bins
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1]
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integer type for offsets
struct BlockHistogramTilesSharedAtomic
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Sample type
typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
// Constants
enum
{
BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
};
/// Shared memory type required by this thread block
struct _TempStorage
{
HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
/// Reference to temp_storage
_TempStorage &temp_storage;
/// Reference to output histograms
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
/// Input data to reduce
InputIteratorRA d_in;
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockHistogramTilesSharedAtomic(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data to reduce
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_out_histograms(d_out_histograms)
{
// Initialize histogram bin counts to zeros
#pragma unroll
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
int histo_offset = 0;
#pragma unroll
for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
{
this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
}
// Finish up with guarded initialization if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
{
this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
}
}
}
/**
* Process a single tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
        SizeT block_offset, ///< The offset of the tile to consume
int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
{
if (FULL_TILE)
{
// Full tile of samples to read and composite
SampleT items[ITEMS_PER_THREAD][CHANNELS];
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (CHANNEL < ACTIVE_CHANNELS)
{
items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
}
}
}
__threadfence_block();
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (CHANNEL < ACTIVE_CHANNELS)
{
atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
}
}
}
__threadfence_block();
}
else
{
// Only a partially-full tile of samples to read and composite
int bounds = valid_items - (threadIdx.x * CHANNELS);
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
#pragma unroll
for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
{
if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
{
SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
}
}
}
}
}
/**
* Aggregate results into output
*/
__device__ __forceinline__ void AggregateOutput()
{
// Barrier to ensure shared memory histograms are coherent
__syncthreads();
// Copy shared memory histograms to output
#pragma unroll
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_offset = (blockIdx.x * BINS);
int histo_offset = 0;
#pragma unroll
for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
{
d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
}
            // Finish up with a guarded copy if necessary
if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
{
d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
}
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
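
Editorial note (not part of the committed CUB sources): the shared-atomic strategy above privatizes the histogram per thread block in shared memory, composites samples there with cheap shared-memory atomics, and only touches global memory once per bin at the end. CUB writes one privatized histogram per block to global memory and combines them later; the sketch below instead folds the block-private counts straight into a single global histogram, purely to keep the illustration short. The names histogram_shared_atomic and NUM_BINS are illustrative only.

#include <cuda_runtime.h>

#define NUM_BINS 256

__global__ void histogram_shared_atomic(const unsigned char *d_samples,
                                        int num_samples,
                                        unsigned int *d_histogram)
{
    __shared__ unsigned int s_bins[NUM_BINS];

    // Zero the block-private histogram
    for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
        s_bins[bin] = 0;
    __syncthreads();

    // Composite this block's samples using shared-memory atomics
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (; i < num_samples; i += stride)
        atomicAdd(&s_bins[d_samples[i]], 1u);
    __syncthreads();

    // Fold the block-private counts into the global histogram (one atomic per bin)
    for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
        atomicAdd(&d_histogram[bin], s_bins[bin]);
}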

View File

@ -0,0 +1,364 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
 * cub::BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
*/
#pragma once
#include <iterator>
#include "../../../block/block_radix_sort.cuh"
#include "../../../block/block_discontinuity.cuh"
#include "../../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
 * BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of a device-wide histogram using local sorting
*/
template <
typename BlockHistogramTilesPolicy, ///< Tuning policy
int BINS, ///< Number of histogram bins per channel
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1]
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integer type for offsets
struct BlockHistogramTilesSort
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Sample type
typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT;
// Constants
enum
{
BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD,
TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
};
// Parameterize BlockRadixSort type for our thread block
typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// Parameterize BlockDiscontinuity type for our thread block
typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
/// Shared memory type required by this thread block
union _TempStorage
{
// Storage for sorting bin values
typename BlockRadixSortT::TempStorage sort;
struct
{
// Storage for detecting discontinuities in the tile of sorted bin values
typename BlockDiscontinuityT::TempStorage flag;
// Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
};
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
// Discontinuity functor
struct DiscontinuityOp
{
// Reference to temp_storage
_TempStorage &temp_storage;
// Constructor
__device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
temp_storage(temp_storage)
{}
// Discontinuity predicate
__device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
{
if (a != b)
{
// Note the begin/end offsets in shared storage
temp_storage.run_begin[b] = b_index;
temp_storage.run_end[a] = b_index;
return true;
}
else
{
return false;
}
}
};
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
/// Reference to temp_storage
_TempStorage &temp_storage;
/// Histogram counters striped across threads
HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
/// Reference to output histograms
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
/// Input data to reduce
InputIteratorRA d_in;
//---------------------------------------------------------------------
// Interface
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ BlockHistogramTilesSort(
TempStorage &temp_storage, ///< Reference to temp_storage
InputIteratorRA d_in, ///< Input data to reduce
HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_out_histograms(d_out_histograms)
{
// Initialize histogram counters striped across threads
#pragma unroll
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
#pragma unroll
for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
{
thread_counters[CHANNEL][COUNTER] = 0;
}
}
}
/**
* Composite a tile of input items
*/
__device__ __forceinline__ void Composite(
SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples
HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads
{
// Sort bytes in blocked arrangement
BlockRadixSortT(temp_storage.sort).Sort(items);
__syncthreads();
// Initialize the shared memory's run_begin and run_end for each bin
#pragma unroll
for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
{
temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
}
__syncthreads();
// Note the begin/end run offsets of bin runs in the sorted tile
int flags[ITEMS_PER_THREAD]; // unused
DiscontinuityOp flag_op(temp_storage);
BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
// Update begin for first item
if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
__syncthreads();
        // Composite into histogram: accumulate each bin's run length into the striped thread counters
#pragma unroll
for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
{
int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
thread_counters[COUNTER] += run_length;
}
}
/**
* Process one channel within a tile.
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTileChannel(
int channel,
SizeT block_offset,
int valid_items)
{
// Load items in striped fashion
if (FULL_TILE)
{
// Full tile of samples to read and composite
SampleT items[ITEMS_PER_THREAD];
// Unguarded loads
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
}
// Composite our histogram data
Composite(items, thread_counters[channel]);
}
else
{
// Only a partially-full tile of samples to read and composite
SampleT items[ITEMS_PER_THREAD];
            // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how many out-of-bounds items to subtract out later
int bounds = (valid_items - (threadIdx.x * CHANNELS));
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
0;
}
// Composite our histogram data
Composite(items, thread_counters[channel]);
__syncthreads();
// Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
if (threadIdx.x == 0)
{
int extra = (TILE_ITEMS - valid_items) / CHANNELS;
thread_counters[channel][0] -= extra;
}
}
}
/**
* Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step.
*/
template <bool FULL_TILE, int CHANNEL, int END>
struct IterateChannels
{
/**
* Process one channel within a tile.
*/
static __device__ __forceinline__ void ConsumeTileChannel(
BlockHistogramTilesSort *cta,
SizeT block_offset,
int valid_items)
{
__syncthreads();
cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
}
};
/**
* Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step.
*/
template <bool FULL_TILE, int END>
struct IterateChannels<FULL_TILE, END, END>
{
static __device__ __forceinline__ void ConsumeTileChannel(BlockHistogramTilesSort *cta, SizeT block_offset, int valid_items) {}
};
/**
* Process a single tile of input
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ConsumeTile(
        SizeT block_offset, ///< The offset of the tile to consume
int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
{
// First channel
ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
// Iterate through remaining channels
IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
}
/**
* Aggregate results into output
*/
__device__ __forceinline__ void AggregateOutput()
{
// Copy counters striped across threads into the histogram output
#pragma unroll
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_offset = (blockIdx.x * BINS);
#pragma unroll
for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
{
int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
{
d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
}
}
}
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
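
Editorial note (not part of the committed CUB sources): the sort-based strategy above replaces atomics entirely. Each tile of samples is sorted, every bin then occupies one contiguous run, and the bin count is simply the run's end offset minus its begin offset, which is exactly the bookkeeping done by DiscontinuityOp and the run_begin/run_end arrays. A serial host-side sketch of the same idea follows; histogram_by_sorting is an illustrative name, not a CUB function, and it assumes every sample value is less than num_bins.

#include <algorithm>
#include <vector>

std::vector<int> histogram_by_sorting(std::vector<unsigned char> tile, int num_bins)
{
    std::sort(tile.begin(), tile.end());

    // Initialize both offsets to tile.size(): bins that never appear get a
    // zero-length run, and the final run's end offset is already correct.
    std::vector<int> run_begin(num_bins, (int)tile.size());
    std::vector<int> run_end(num_bins, (int)tile.size());

    // Mark run boundaries at each discontinuity (and at the very first item)
    if (!tile.empty()) run_begin[tile[0]] = 0;
    for (size_t i = 1; i < tile.size(); ++i)
    {
        if (tile[i] != tile[i - 1])
        {
            run_begin[tile[i]]   = (int)i;
            run_end[tile[i - 1]] = (int)i;
        }
    }

    // Each bin's count is its run length
    std::vector<int> histogram(num_bins);
    for (int bin = 0; bin < num_bins; ++bin)
        histogram[bin] = run_end[bin] - run_begin[bin];
    return histogram;
}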

File diff suppressed because it is too large

View File

@ -0,0 +1,890 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
 * cub::DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "block/block_radix_sort_upsweep_tiles.cuh"
#include "block/block_radix_sort_downsweep_tiles.cuh"
#include "block/block_scan_tiles.cuh"
#include "../grid/grid_even_share.cuh"
#include "../util_debug.cuh"
#include "../util_device.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
* Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block.
*/
template <
typename BlockRadixSortUpsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction
typename Key, ///< Key type
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortUpsweepKernel(
Key *d_keys, ///< [in] Input keys buffer
SizeT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
SizeT num_items, ///< [in] Total number of input data items
int current_bit, ///< [in] Bit position of current radix digit
    bool use_primary_bit_granularity, ///< [in] Whether or not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
bool first_pass, ///< [in] Whether this is the first digit pass
GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
{
// Alternate policy for when fewer bits remain
typedef typename BlockRadixSortUpsweepTilesPolicy::AltPolicy AltPolicy;
// Parameterize two versions of BlockRadixSortUpsweepTiles type for the current configuration
typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT> BlockRadixSortUpsweepTilesT; // Primary
typedef BlockRadixSortUpsweepTiles<AltPolicy, Key, SizeT> AltBlockRadixSortUpsweepTilesT; // Alternate (smaller bit granularity)
// Shared memory storage
__shared__ union
{
typename BlockRadixSortUpsweepTilesT::TempStorage pass_storage;
typename AltBlockRadixSortUpsweepTilesT::TempStorage alt_pass_storage;
} temp_storage;
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
// Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit)
if (use_primary_bit_granularity)
{
// Primary granularity
SizeT bin_count;
BlockRadixSortUpsweepTilesT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles(
even_share.block_offset,
even_share.block_oob,
bin_count);
// Write out digit counts (striped)
if (threadIdx.x < BlockRadixSortUpsweepTilesT::RADIX_DIGITS)
{
d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
}
}
else
{
// Alternate granularity
// Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit)
SizeT bin_count;
AltBlockRadixSortUpsweepTilesT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles(
even_share.block_offset,
even_share.block_oob,
bin_count);
// Write out digit counts (striped)
if (threadIdx.x < AltBlockRadixSortUpsweepTilesT::RADIX_DIGITS)
{
d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
}
}
}
/**
* Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
*/
template <
typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanKernel(
SizeT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
int num_counts) ///< [in] Total number of bin-counts
{
// Parameterize the BlockScanTiles type for the current configuration
typedef BlockScanTiles<BlockScanTilesPolicy, SizeT*, SizeT*, cub::Sum, SizeT, SizeT> BlockScanTilesT;
// Shared memory storage
__shared__ typename BlockScanTilesT::TempStorage temp_storage;
// Block scan instance
BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0)) ;
// Process full input tiles
int block_offset = 0;
RunningBlockPrefixOp<SizeT> prefix_op;
prefix_op.running_total = 0;
while (block_offset < num_counts)
{
block_scan.ConsumeTile<true, false>(block_offset, prefix_op);
block_offset += BlockScanTilesT::TILE_ITEMS;
}
}
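// ---------------------------------------------------------------------------
// Editorial note (not part of CUB): what the spine scan above accomplishes.
// The upsweep kernel stores its per-block digit counts in a striped layout,
// d_spine[(gridDim.x * digit) + block], i.e. every block's count for digit 0,
// then every block's count for digit 1, and so on. Running an exclusive prefix
// sum over that flat array therefore yields, for each (digit, block) pair, the
// global offset at which that block may begin scattering keys of that digit
// during the downsweep pass. A serial equivalent of RadixSortScanKernel,
// included only as an illustration (SpineExclusiveScanSketch is not a CUB API):
template <typename SizeT>
inline void SpineExclusiveScanSketch(SizeT *spine, int num_counts)
{
    SizeT running_total = 0;
    for (int i = 0; i < num_counts; ++i)
    {
        SizeT count    = spine[i];
        spine[i]       = running_total;   // exclusive: offset excludes this entry's own count
        running_total += count;
    }
}
// ---------------------------------------------------------------------------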
/**
* Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
*/
template <
typename BlockRadixSortDownsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction
typename Key, ///< Key type
typename Value, ///< Value type
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS))
__global__ void RadixSortDownsweepKernel(
Key *d_keys_in, ///< [in] Input keys ping buffer
Key *d_keys_out, ///< [in] Output keys pong buffer
Value *d_values_in, ///< [in] Input values ping buffer
Value *d_values_out, ///< [in] Output values pong buffer
SizeT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
SizeT num_items, ///< [in] Total number of input data items
int current_bit, ///< [in] Bit position of current radix digit
    bool use_primary_bit_granularity, ///< [in] Whether or not to use the primary policy (or the embedded alternate policy for smaller bit granularity)
bool first_pass, ///< [in] Whether this is the first digit pass
bool last_pass, ///< [in] Whether this is the last digit pass
GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
{
// Alternate policy for when fewer bits remain
typedef typename BlockRadixSortDownsweepTilesPolicy::AltPolicy AltPolicy;
// Parameterize two versions of BlockRadixSortDownsweepTiles type for the current configuration
typedef BlockRadixSortDownsweepTiles<BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT> BlockRadixSortDownsweepTilesT;
typedef BlockRadixSortDownsweepTiles<AltPolicy, Key, Value, SizeT> AltBlockRadixSortDownsweepTilesT;
// Shared memory storage
__shared__ union
{
typename BlockRadixSortDownsweepTilesT::TempStorage pass_storage;
typename AltBlockRadixSortDownsweepTilesT::TempStorage alt_pass_storage;
} temp_storage;
// Initialize even-share descriptor for this thread block
even_share.BlockInit();
if (use_primary_bit_granularity)
{
// Process input tiles
BlockRadixSortDownsweepTilesT(temp_storage.pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
even_share.block_offset,
even_share.block_oob);
}
else
{
// Process input tiles
AltBlockRadixSortDownsweepTilesT(temp_storage.alt_pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles(
even_share.block_offset,
even_share.block_oob);
}
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* DeviceRadixSort
*****************************************************************************/
/**
* \brief DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory. ![](sorting_logo.png)
* \ingroup DeviceModule
*
* \par Overview
* The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
* items into ascending order. It relies upon a positional representation for
* keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
* characters, etc.) specified from least-significant to most-significant. For a
* given input sequence of keys and a set of rules specifying a total ordering
* of the symbolic alphabet, the radix sorting method produces a lexicographic
* ordering of those keys.
*
* \par
* DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
* <tt>unsigned char</tt>, \p int, \p double, etc. Although the direct radix sorting
* method can only be applied to unsigned integral types, BlockRadixSort
* is able to sort signed and floating-point types via simple bit-wise transformations
* that ensure lexicographic key ordering.
*
* \par Usage Considerations
* \cdp_class{DeviceRadixSort}
*
* \par Performance
*
* \image html lsd_sort_perf.png
*
*/
struct DeviceRadixSort
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Generic structure for encapsulating dispatch properties codified in block policy.
struct KernelDispachParams
{
int block_threads;
int items_per_thread;
cudaSharedMemConfig smem_config;
int radix_bits;
int alt_radix_bits;
int subscription_factor;
int tile_size;
template <typename SortBlockPolicy>
__host__ __device__ __forceinline__
void InitUpsweepPolicy(int subscription_factor = 1)
{
block_threads = SortBlockPolicy::BLOCK_THREADS;
items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
radix_bits = SortBlockPolicy::RADIX_BITS;
alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
smem_config = cudaSharedMemBankSizeFourByte;
this->subscription_factor = subscription_factor;
tile_size = block_threads * items_per_thread;
}
template <typename ScanBlockPolicy>
__host__ __device__ __forceinline__
void InitScanPolicy()
{
block_threads = ScanBlockPolicy::BLOCK_THREADS;
items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD;
radix_bits = 0;
alt_radix_bits = 0;
smem_config = cudaSharedMemBankSizeFourByte;
subscription_factor = 0;
tile_size = block_threads * items_per_thread;
}
template <typename SortBlockPolicy>
__host__ __device__ __forceinline__
void InitDownsweepPolicy(int subscription_factor = 1)
{
block_threads = SortBlockPolicy::BLOCK_THREADS;
items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD;
radix_bits = SortBlockPolicy::RADIX_BITS;
alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS;
smem_config = SortBlockPolicy::SMEM_CONFIG;
this->subscription_factor = subscription_factor;
tile_size = block_threads * items_per_thread;
}
};
/******************************************************************************
* Tuning policies
******************************************************************************/
/// Specializations of tuned policy types for different PTX architectures
template <typename Key, typename Value, typename SizeT, int ARCH>
struct TunedPolicies;
/// SM35 tune
template <typename Key, typename Value, typename SizeT>
struct TunedPolicies<Key, Value, SizeT, 350>
{
enum {
KEYS_ONLY = (Equals<Value, NullType>::VALUE),
SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
RADIX_BITS = 5,
};
// UpsweepPolicy
typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
/*
// 4bit
typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
*/
// ScanPolicy
typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
// DownsweepPolicy
typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
/*
// 4bit
typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
*/
enum { SUBSCRIPTION_FACTOR = 7 };
};
/// SM20 tune
template <typename Key, typename Value, typename SizeT>
struct TunedPolicies<Key, Value, SizeT, 200>
{
enum {
KEYS_ONLY = (Equals<Value, NullType>::VALUE),
SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
RADIX_BITS = 5,
};
// UpsweepPolicy
typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
// ScanPolicy
typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
// DownsweepPolicy
typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
enum { SUBSCRIPTION_FACTOR = 3 };
};
/// SM10 tune
template <typename Key, typename Value, typename SizeT>
struct TunedPolicies<Key, Value, SizeT, 100>
{
enum {
RADIX_BITS = 4,
};
// UpsweepPolicy
typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
// ScanPolicy
typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
// DownsweepPolicy
typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
enum { SUBSCRIPTION_FACTOR = 3 };
};
/******************************************************************************
* Default policy initializer
******************************************************************************/
/// Tuning policy for the PTX architecture that DeviceRadixSort operations will get dispatched to
template <typename Key, typename Value, typename SizeT>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
(CUB_PTX_ARCH >= 200) ?
200 :
100;
// Tuned policy set for the current PTX compiler pass
typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
// UpsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {};
// ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};
// DownsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {};
// Subscription factor for the current PTX compiler pass
enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR };
/**
* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
*/
static void InitDispatchParams(
int ptx_version,
KernelDispachParams &upsweep_dispatch_params,
KernelDispachParams &scan_dispatch_params,
KernelDispachParams &downsweep_dispatch_params)
{
if (ptx_version >= 350)
{
typedef TunedPolicies<Key, Value, SizeT, 350> TunedPolicies;
upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
}
else if (ptx_version >= 200)
{
typedef TunedPolicies<Key, Value, SizeT, 200> TunedPolicies;
upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
}
else
{
typedef TunedPolicies<Key, Value, SizeT, 100> TunedPolicies;
upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>();
downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
}
}
};
/******************************************************************************
* Utility methods
******************************************************************************/
/**
     * Internal dispatch routine for computing a device-wide radix sort using multiple stages of kernel invocations.
*/
template <
typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel
        typename SpineKernelPtr, ///< Function type of cub::RadixSortScanKernel
        typename DownsweepKernelPtr, ///< Function type of cub::RadixSortDownsweepKernel
typename Key, ///< Key type
typename Value, ///< Value type
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel
        SpineKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortScanKernel
        DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortDownsweepKernel
KernelDispachParams &upsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
KernelDispachParams &downsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        SizeT num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
cudaError error = cudaSuccess;
do
{
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
int downsweep_sm_occupancy = CUB_MIN(
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / downsweep_dispatch_params.block_threads);
int upsweep_sm_occupancy = downsweep_sm_occupancy;
#ifndef __CUDA_ARCH__
// We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties
Device device_props;
if (CubDebug(error = device_props.Init(device_ordinal))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
downsweep_sm_occupancy,
downsweep_kernel,
downsweep_dispatch_params.block_threads))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
upsweep_sm_occupancy,
upsweep_kernel,
upsweep_dispatch_params.block_threads))) break;
#endif
// Get device occupancies
int downsweep_occupancy = downsweep_sm_occupancy * sm_count;
// Get even-share work distribution descriptor
GridEvenShare<SizeT> even_share;
int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor;
int downsweep_grid_size;
even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size);
downsweep_grid_size = even_share.grid_size;
// Get number of spine elements (round up to nearest spine scan kernel tile size)
int bins = 1 << downsweep_dispatch_params.radix_bits;
int spine_size = downsweep_grid_size * bins;
int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
spine_size = spine_tiles * scan_dispatch_params.tile_size;
int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits;
int alt_spine_size = downsweep_grid_size * alt_bins;
int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size;
// Temporary storage allocation requirements
void* allocations[1];
size_t allocation_sizes[1] =
{
spine_size * sizeof(SizeT), // bytes needed for privatized block digit histograms
};
// Alias temporaries (or set the necessary size of the storage allocation)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Privatized per-block digit histograms
SizeT *d_spine = (SizeT*) allocations[0];
#ifndef __CUDA_ARCH__
// Get current smem bank configuration
cudaSharedMemConfig original_smem_config;
if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
cudaSharedMemConfig current_smem_config = original_smem_config;
#endif
// Iterate over digit places
int current_bit = begin_bit;
while (current_bit < end_bit)
{
                // Use primary bit granularity if the bits remaining are a whole multiple of the primary bit granularity
int bits_remaining = end_bit - current_bit;
bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0);
int radix_bits = (use_primary_bit_granularity) ?
downsweep_dispatch_params.radix_bits :
downsweep_dispatch_params.alt_radix_bits;
#ifndef __CUDA_ARCH__
// Update smem config if necessary
if (current_smem_config != upsweep_dispatch_params.smem_config)
{
if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break;
current_smem_config = upsweep_dispatch_params.smem_config;
}
#endif
// Log upsweep_kernel configuration
if (stream_synchronous)
CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits);
// Invoke upsweep_kernel with same grid size as downsweep_kernel
upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>(
d_keys.d_buffers[d_keys.selector],
d_spine,
num_items,
current_bit,
use_primary_bit_granularity,
(current_bit == begin_bit),
even_share);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log scan_kernel configuration
if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread);
// Invoke scan_kernel
scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>(
d_spine,
(use_primary_bit_granularity) ? spine_size : alt_spine_size);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
#ifndef __CUDA_ARCH__
// Update smem config if necessary
if (current_smem_config != downsweep_dispatch_params.smem_config)
{
if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break;
current_smem_config = downsweep_dispatch_params.smem_config;
}
#endif
// Log downsweep_kernel configuration
if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy);
// Invoke downsweep_kernel
downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>(
d_keys.d_buffers[d_keys.selector],
d_keys.d_buffers[d_keys.selector ^ 1],
d_values.d_buffers[d_values.selector],
d_values.d_buffers[d_values.selector ^ 1],
d_spine,
num_items,
current_bit,
use_primary_bit_granularity,
(current_bit == begin_bit),
(current_bit + downsweep_dispatch_params.radix_bits >= end_bit),
even_share);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Invert selectors
d_keys.selector ^= 1;
d_values.selector ^= 1;
// Update current bit position
current_bit += radix_bits;
}
#ifndef __CUDA_ARCH__
// Reset smem config if necessary
if (current_smem_config != original_smem_config)
{
if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
}
#endif
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Interface
******************************************************************************/
/**
* \brief Sorts key-value pairs.
*
* \par
* The sorting operation requires a pair of key buffers and a pair of value
* buffers. Each pair is wrapped in a DoubleBuffer structure whose member
* DoubleBuffer::Current() references the active buffer. The currently-active
* buffer may be changed by the sorting operation.
*
* \devicestorage
*
* \cdp
*
* \par
* The code snippet below illustrates the sorting of a device vector of \p int keys
* with associated vector of \p int values.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Create a set of DoubleBuffers to wrap pairs of device pointers for
* // sorting data (keys, values, and equivalently-sized alternate buffers)
* int num_items = ...
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
* cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
*
* // Determine temporary device storage requirements for sorting operation
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
*
* // Allocate temporary storage for sorting operation
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run sorting operation
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
*
* // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
*
* \endcode
*
* \tparam Key <b>[inferred]</b> Key type
* \tparam Value <b>[inferred]</b> Value type
*/
template <
typename Key,
typename Value>
__host__ __device__ __forceinline__
static cudaError_t SortPairs(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
        int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Type used for array indexing
typedef int SizeT;
// Tuning polices
typedef PtxDefaultPolicies<Key, Value, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy; // Upsweep kernel policy
typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy
typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy; // Downsweep kernel policy
cudaError error = cudaSuccess;
do
{
// Declare dispatch parameters
KernelDispachParams upsweep_dispatch_params;
KernelDispachParams scan_dispatch_params;
KernelDispachParams downsweep_dispatch_params;
#ifdef __CUDA_ARCH__
// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
scan_dispatch_params.InitScanPolicy<ScanPolicy>();
downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
#else
// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
PtxDefaultPolicies::InitDispatchParams(
ptx_version,
upsweep_dispatch_params,
scan_dispatch_params,
downsweep_dispatch_params);
#endif
// Dispatch
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>,
RadixSortScanKernel<ScanPolicy, SizeT>,
RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>,
upsweep_dispatch_params,
scan_dispatch_params,
downsweep_dispatch_params,
d_keys,
d_values,
num_items,
begin_bit,
end_bit,
stream,
stream_synchronous))) break;
}
while (0);
return error;
}
/**
* \brief Sorts keys
*
* \par
* The sorting operation requires a pair of key buffers. The pair is
* wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current()
* references the active buffer. The currently-active buffer may be changed
* by the sorting operation.
*
* \devicestorage
*
* \cdp
*
* \par
* The code snippet below illustrates the sorting of a device vector of \p int keys.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Create a set of DoubleBuffers to wrap pairs of device pointers for
* // sorting data (keys and equivalently-sized alternate buffer)
* int num_items = ...
* cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
*
* // Determine temporary device storage requirements for sorting operation
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
*
* // Allocate temporary storage for sorting operation
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run sorting operation
* cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
*
* // Sorted keys are referenced by d_keys.Current()
*
* \endcode
*
* \tparam Key <b>[inferred]</b> Key type
*/
template <typename Key>
__host__ __device__ __forceinline__
static cudaError_t SortKeys(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
        int num_items, ///< [in] Number of items to sort
int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
DoubleBuffer<NullType> d_values;
return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
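
Editorial note (not part of the committed CUB sources): the DeviceRadixSort overview above mentions that signed and floating-point keys are handled "via simple bit-wise transformations that ensure lexicographic key ordering". CUB hides these transformations inside its key traits; the helpers below only sketch the standard trick for 32-bit keys and are illustrative, not CUB APIs. A signed integer becomes sortable as unsigned by flipping its sign bit; an IEEE-754 float additionally needs all of its bits flipped when it is negative, because larger negative magnitudes must compare smaller.

#include <cstdint>
#include <cstring>

// Map a signed 32-bit key onto an unsigned key with the same ordering.
inline uint32_t twiddle_in_int32(int32_t key)
{
    return static_cast<uint32_t>(key) ^ 0x80000000u;
}

// Map a 32-bit IEEE-754 float onto an unsigned key with the same ordering
// (NaNs aside): flip every bit of negative values, only the sign bit otherwise.
inline uint32_t twiddle_in_float(float key)
{
    uint32_t bits;
    std::memcpy(&bits, &key, sizeof(bits));
    uint32_t mask = (bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return bits ^ mask;
}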

View File

@ -0,0 +1,775 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "block/block_reduce_tiles.cuh"
#include "../thread/thread_operators.cuh"
#include "../grid/grid_even_share.cuh"
#include "../grid/grid_queue.cuh"
#include "../util_debug.cuh"
#include "../util_device.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Kernel entry points
*****************************************************************************/
/**
* Reduction pass kernel entry point (multi-block). Computes privatized reductions, one per thread block.
*/
template <
typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename SizeT, ///< Integer type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
__global__ void ReducePrivatizedKernel(
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
SizeT num_items, ///< [in] Total number of input data items
GridEvenShare<SizeT> even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
GridQueue<SizeT> queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Thread block type for reducing input tiles
typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT;
// Block-wide aggregate
T block_aggregate;
// Shared memory storage
__shared__ typename BlockReduceTilesT::TempStorage temp_storage;
// Consume input tiles
BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles(
num_items,
even_share,
queue,
block_aggregate,
Int2Type<BlockReduceTilesPolicy::GRID_MAPPING>());
// Output result
if (threadIdx.x == 0)
{
d_out[blockIdx.x] = block_aggregate;
}
}
/**
* Reduction pass kernel entry point (single-block). Aggregates privatized threadblock reductions from a previous multi-block reduction pass.
*/
template <
typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename SizeT, ///< Integer type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1)
__global__ void ReduceSingleKernel(
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
SizeT num_items, ///< [in] Total number of input data items
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Thread block type for reducing input tiles
typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT;
// Block-wide aggregate
T block_aggregate;
// Shared memory storage
__shared__ typename BlockReduceTilesT::TempStorage temp_storage;
// Consume input tiles
BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles(
SizeT(0),
SizeT(num_items),
block_aggregate);
// Output result
if (threadIdx.x == 0)
{
d_out[blockIdx.x] = block_aggregate;
}
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* DeviceReduce
*****************************************************************************/
/**
* \brief DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. ![](reduce_logo.png)
* \ingroup DeviceModule
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
*
* \par Usage Considerations
* \cdp_class{DeviceReduce}
*
* \par Performance
*
* \image html reduction_perf.png
*
*/
struct DeviceReduce
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Generic structure for encapsulating dispatch properties codified in block policy.
struct KernelDispachParams
{
int block_threads;
int items_per_thread;
int vector_load_length;
BlockReduceAlgorithm block_algorithm;
PtxLoadModifier load_modifier;
GridMappingStrategy grid_mapping;
int subscription_factor;
int tile_size;
template <typename BlockPolicy>
__host__ __device__ __forceinline__
void Init(int subscription_factor = 1)
{
block_threads = BlockPolicy::BLOCK_THREADS;
items_per_thread = BlockPolicy::ITEMS_PER_THREAD;
vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH;
block_algorithm = BlockPolicy::BLOCK_ALGORITHM;
load_modifier = BlockPolicy::LOAD_MODIFIER;
grid_mapping = BlockPolicy::GRID_MAPPING;
this->subscription_factor = subscription_factor;
tile_size = block_threads * items_per_thread;
}
__host__ __device__ __forceinline__
void Print()
{
printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping, %d subscription",
block_threads,
items_per_thread,
vector_load_length,
block_algorithm,
load_modifier,
grid_mapping,
subscription_factor);
}
};
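// Worked example of Init() (illustrative): for the SM35 PrivatizedPolicy4B below
// (512 threads, 20 items per thread), Init() yields block_threads = 512,
// items_per_thread = 20, and tile_size = 512 * 20 = 10240 items per tile.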
/******************************************************************************
* Tuning policies
******************************************************************************/
/// Specializations of tuned policy types for different PTX architectures
template <
typename T,
typename SizeT,
int ARCH>
struct TunedPolicies;
/// SM35 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 350>
{
// PrivatizedPolicy (1B): GTX Titan: 206.0 GB/s @ 192M 1B items
typedef BlockReduceTilesPolicy<128, 12, 1, BLOCK_REDUCE_RAKING, LOAD_LDG, GRID_MAPPING_DYNAMIC> PrivatizedPolicy1B;
// PrivatizedPolicy (4B): GTX Titan: 254.2 GB/s @ 48M 4B items
typedef BlockReduceTilesPolicy<512, 20, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy4B;
// PrivatizedPolicy
typedef typename If<(sizeof(T) < 4),
PrivatizedPolicy1B,
PrivatizedPolicy4B>::Type PrivatizedPolicy;
// SinglePolicy
typedef BlockReduceTilesPolicy<256, 8, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
enum { SUBSCRIPTION_FACTOR = 7 };
};
/// SM30 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 300>
{
// PrivatizedPolicy: GTX670: 154.0 @ 48M 32-bit T
typedef BlockReduceTilesPolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
// SinglePolicy
typedef BlockReduceTilesPolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
/// SM20 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 200>
{
// PrivatizedPolicy (1B): GTX 580: 158.1 GB/s @ 192M 1B items
typedef BlockReduceTilesPolicy<192, 24, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy1B;
// PrivatizedPolicy (4B): GTX 580: 178.9 GB/s @ 48M 4B items
typedef BlockReduceTilesPolicy<128, 8, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_DYNAMIC> PrivatizedPolicy4B;
// PrivatizedPolicy
typedef typename If<(sizeof(T) < 4),
PrivatizedPolicy1B,
PrivatizedPolicy4B>::Type PrivatizedPolicy;
// SinglePolicy
typedef BlockReduceTilesPolicy<192, 7, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
enum { SUBSCRIPTION_FACTOR = 2 };
};
/// SM13 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 130>
{
// PrivatizedPolicy
typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
// SinglePolicy
typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
/// SM10 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 100>
{
// PrivatizedPolicy
typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy;
// SinglePolicy
typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
/******************************************************************************
* Default policy initializer
******************************************************************************/
/// Tuning policy for the PTX architecture that DeviceReduce operations will get dispatched to
template <typename T, typename SizeT>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
(CUB_PTX_ARCH >= 300) ?
300 :
(CUB_PTX_ARCH >= 200) ?
200 :
(CUB_PTX_ARCH >= 130) ?
130 :
100;
// Tuned policy set for the current PTX compiler pass
typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
// Subscription factor for the current PTX compiler pass
static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR;
// PrivatizedPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct PrivatizedPolicy : PtxTunedPolicies::PrivatizedPolicy {};
// SinglePolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct SinglePolicy : PtxTunedPolicies::SinglePolicy {};
/**
* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
*/
static void InitDispatchParams(
int ptx_version,
KernelDispachParams &privatized_dispatch_params,
KernelDispachParams &single_dispatch_params)
{
if (ptx_version >= 350)
{
typedef TunedPolicies<T, SizeT, 350> TunedPolicies;
privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >();
}
else if (ptx_version >= 300)
{
typedef TunedPolicies<T, SizeT, 300> TunedPolicies;
privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >();
}
else if (ptx_version >= 200)
{
typedef TunedPolicies<T, SizeT, 200> TunedPolicies;
privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >();
}
else if (ptx_version >= 130)
{
typedef TunedPolicies<T, SizeT, 130> TunedPolicies;
privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >();
}
else
{
typedef TunedPolicies<T, SizeT, 100> TunedPolicies;
privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >();
}
}
};
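// Example of the version bucketing above (illustrative): a device reporting
// ptx_version 210 fails the ">= 350" and ">= 300" tests, satisfies ">= 200",
// and is therefore dispatched with the SM20-tuned policies; anything below
// 130 falls back to the SM10 tune.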
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal dispatch routine for computing a device-wide reduction using two stages of kernel invocations.
*/
template <
typename ReducePrivatizedKernelPtr, ///< Function type of cub::ReducePrivatizedKernel
typename ReduceSingleKernelPtr, ///< Function type of cub::ReduceSingleKernel
typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename SizeT, ///< Integer type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ReducePrivatizedKernelPtr privatized_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReducePrivatizedKernel
ReduceSingleKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceSingleKernel
ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
KernelDispachParams &privatized_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p privatized_kernel_ptr was compiled for
KernelDispachParams &single_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
SizeT num_items, ///< [in] Number of items to reduce
ReductionOp reduction_op, ///< [in] Binary reduction operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
// Data type of input iterator
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
cudaError error = cudaSuccess;
do
{
if ((privatized_kernel == NULL) || (num_items <= (single_dispatch_params.tile_size)))
{
// Dispatch a single-block reduction kernel
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
{
temp_storage_bytes = 1;
return cudaSuccess;
}
// Log single_kernel configuration
if (stream_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);
// Invoke single_kernel
single_kernel<<<1, single_dispatch_params.block_threads>>>(
d_in,
d_out,
num_items,
reduction_op);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
else
{
// Dispatch two kernels: a multi-block kernel to compute
// privatized per-block reductions, and then a single-block
// to reduce those
// Get device ordinal
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Get a rough estimate of privatized_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
int privatized_sm_occupancy = CUB_MIN(
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / privatized_dispatch_params.block_threads);
#ifndef __CUDA_ARCH__
// We're on the host, so come up with a more accurate estimate of privatized_kernel SM occupancy from actual device properties
Device device_props;
if (CubDebug(error = device_props.Init(device_ordinal))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
privatized_sm_occupancy,
privatized_kernel,
privatized_dispatch_params.block_threads))) break;
#endif
// Get device occupancy for privatized_kernel
int privatized_occupancy = privatized_sm_occupancy * sm_count;
// Even-share work distribution
GridEvenShare<SizeT> even_share;
// Get grid size for privatized_kernel
int privatized_grid_size;
switch (privatized_dispatch_params.grid_mapping)
{
case GRID_MAPPING_EVEN_SHARE:
// Work is distributed evenly
even_share.GridInit(
num_items,
privatized_occupancy * privatized_dispatch_params.subscription_factor,
privatized_dispatch_params.tile_size);
privatized_grid_size = even_share.grid_size;
break;
case GRID_MAPPING_DYNAMIC:
// Work is distributed dynamically
int num_tiles = (num_items + privatized_dispatch_params.tile_size - 1) / privatized_dispatch_params.tile_size;
privatized_grid_size = (num_tiles < privatized_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
privatized_occupancy; // Fill the device with threadblocks
break;
};
// Temporary storage allocation requirements
void* allocations[2];
size_t allocation_sizes[2] =
{
privatized_grid_size * sizeof(T), // bytes needed for privatized block reductions
GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
};
// Alias temporaries (or set the necessary size of the storage allocation)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Privatized per-block reductions
T *d_block_reductions = (T*) allocations[0];
// Grid queue descriptor
GridQueue<SizeT> queue(allocations[1]);
// Prepare the dynamic queue descriptor if necessary
if (privatized_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC)
{
// Prepare queue using a kernel so we know it gets prepared once per operation
if (stream_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
// Invoke prepare_drain_kernel
prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
// Log privatized_kernel configuration
if (stream_synchronous) CubLog("Invoking privatized_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
privatized_grid_size, privatized_dispatch_params.block_threads, (long long) stream, privatized_dispatch_params.items_per_thread, privatized_sm_occupancy);
// Invoke privatized_kernel
privatized_kernel<<<privatized_grid_size, privatized_dispatch_params.block_threads, 0, stream>>>(
d_in,
d_block_reductions,
num_items,
even_share,
queue,
reduction_op);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Log single_kernel configuration
if (stream_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
1, single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread);
// Invoke single_kernel
single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>(
d_block_reductions,
d_out,
privatized_grid_size,
reduction_op);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
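// Worked sizing example for the even-share path above (illustrative; assumes a
// hypothetical 14-SM device and the SM35 4-byte tune of 512 threads/block,
// 20 items/thread, subscription factor 7):
//   privatized_sm_occupancy <= 2048 / 512 = 4 blocks per SM
//   privatized_occupancy     = 4 * 14     = 56 resident blocks
//   max even-share grid size = 56 * 7     = 392 blocks, each consuming
//                              tiles of 512 * 20 = 10240 items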
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Interface
******************************************************************************/
/**
* \brief Computes a device-wide reduction using the specified binary \p reduction_op functor.
*
* \par
* Does not support non-commutative reduction operators.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the max reduction of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_reduce_input, *d_aggregate;
* int num_items = ...
* ...
*
* // Determine temporary device storage requirements for reduction
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max());
*
* // Allocate temporary storage for reduction
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run reduction (max)
* cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max());
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Reduce(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
int num_items, ///< [in] Number of items to reduce
ReductionOp reduction_op, ///< [in] Binary reduction operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Type used for array indexing
typedef int SizeT;
// Data type of input iterator
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuning policies
typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
typedef typename PtxDefaultPolicies::PrivatizedPolicy PrivatizedPolicy; // Multi-block kernel policy
typedef typename PtxDefaultPolicies::SinglePolicy SinglePolicy; // Single-block kernel policy
cudaError error = cudaSuccess;
do
{
// Declare dispatch parameters
KernelDispachParams privatized_dispatch_params;
KernelDispachParams single_dispatch_params;
#ifdef __CUDA_ARCH__
// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
privatized_dispatch_params.Init<PrivatizedPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR);
single_dispatch_params.Init<SinglePolicy>();
#else
// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
PtxDefaultPolicies::InitDispatchParams(ptx_version, privatized_dispatch_params, single_dispatch_params);
#endif
// Dispatch
if (CubDebug(error = Dispatch(
d_temp_storage,
temp_storage_bytes,
ReducePrivatizedKernel<PrivatizedPolicy, InputIteratorRA, T*, SizeT, ReductionOp>,
ReduceSingleKernel<SinglePolicy, T*, OutputIteratorRA, SizeT, ReductionOp>,
ResetDrainKernel<SizeT>,
privatized_dispatch_params,
single_dispatch_params,
d_in,
d_out,
num_items,
reduction_op,
stream,
stream_synchronous))) break;
}
while (0);
return error;
}
/**
* \brief Computes a device-wide sum using the addition ('+') operator.
*
* \par
* Does not support non-commutative reduction operators.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the sum reduction of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_reduce_input, *d_aggregate;
* int num_items = ...
* ...
*
* // Determine temporary device storage requirements for summation
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items);
*
* // Allocate temporary storage for summation
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run reduction summation
* cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items);
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t Sum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
int num_items, ///< [in] Number of items to reduce
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
return Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), stream, stream_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)
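// A minimal host-side sketch of the Sum call pattern documented above
// (illustrative only, not part of the library): the array contents are made up
// for the example, and error checking / cudaFree calls are omitted.
//
//     #include <cub/cub.cuh>
//
//     int main()
//     {
//         const int num_items = 4;
//         int h_in[num_items] = {1, 2, 3, 4};          // expected sum: 10
//
//         int *d_in, *d_sum;
//         cudaMalloc(&d_in,  sizeof(int) * num_items);
//         cudaMalloc(&d_sum, sizeof(int));
//         cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice);
//
//         // First call sizes the temporary storage, second call runs the reduction
//         void   *d_temp_storage     = NULL;
//         size_t  temp_storage_bytes = 0;
//         cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
//         cudaMalloc(&d_temp_storage, temp_storage_bytes);
//         cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
//
//         int h_sum = 0;
//         cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);
//         return (h_sum == 10) ? 0 : 1;
//     }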


@ -0,0 +1,633 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceReduceByKey provides device-wide, parallel operations for computing reductions of consecutive values whose corresponding keys are equal, for data residing within global memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "block/block_reduce_by_key_tiles.cuh"
#include "device_scan.cuh"
#include "../thread/thread_operators.cuh"
#include "../grid/grid_queue.cuh"
#include "../util_iterator.cuh"
#include "../util_debug.cuh"
#include "../util_device.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reduce-by-key kernel entry point (multi-block)
*/
template <
typename BlockSweepScanPolicy, ///< Tuning policy for cub::BlockSweepScan abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename T, ///< The scan data type
typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockSweepScanPolicy::BLOCK_THREADS))
__global__ void MultiBlockScanKernel(
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
ScanTileDescriptor<T> *d_tile_status, ///< Global list of tile status
ReductionOp reduction_op, ///< Binary scan operator
Identity identity, ///< Identity element
SizeT num_items, ///< Total number of scan items for the entire problem
GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks
{
enum
{
TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
};
// Thread block type for scanning input tiles
typedef BlockSweepScan<
BlockSweepScanPolicy,
InputIteratorRA,
OutputIteratorRA,
ReductionOp,
Identity,
SizeT> BlockSweepScanT;
// Shared memory for BlockSweepScan
__shared__ typename BlockSweepScanT::TempStorage temp_storage;
// Process tiles
BlockSweepScanT(temp_storage, d_in, d_out, reduction_op, identity).ConsumeTiles(
num_items,
queue,
d_tile_status + TILE_STATUS_PADDING);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* DeviceReduceByKey
*****************************************************************************/
/**
* \addtogroup DeviceModule
* @{
*/
/**
* \brief DeviceReduceByKey provides operations for computing device-wide reductions of consecutive values whose corresponding keys are equal, for data residing within global memory. ![](scan_logo.png)
*/
struct DeviceReduceByKey
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockSweepScanPolicy.
struct KernelDispachParams
{
// Policy fields
int block_threads;
int items_per_thread;
BlockLoadAlgorithm load_policy;
BlockStoreAlgorithm store_policy;
BlockScanAlgorithm scan_algorithm;
// Other misc
int tile_size;
template <typename BlockSweepScanPolicy>
__host__ __device__ __forceinline__
void Init()
{
block_threads = BlockSweepScanPolicy::BLOCK_THREADS;
items_per_thread = BlockSweepScanPolicy::ITEMS_PER_THREAD;
load_policy = BlockSweepScanPolicy::LOAD_ALGORITHM;
store_policy = BlockSweepScanPolicy::STORE_ALGORITHM;
scan_algorithm = BlockSweepScanPolicy::SCAN_ALGORITHM;
tile_size = block_threads * items_per_thread;
}
__host__ __device__ __forceinline__
void Print()
{
printf("%d, %d, %d, %d, %d",
block_threads,
items_per_thread,
load_policy,
store_policy,
scan_algorithm);
}
};
/******************************************************************************
* Tuning policies
******************************************************************************/
/// Specializations of tuned policy types for different PTX architectures
template <
typename T,
typename SizeT,
int ARCH>
struct TunedPolicies;
/// SM35 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 350>
{
typedef BlockSweepScanPolicy<128, 16, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy;
};
/// SM30 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 300>
{
typedef BlockSweepScanPolicy<256, 9, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy;
};
/// SM20 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 200>
{
typedef BlockSweepScanPolicy<128, 15, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy;
};
/// SM10 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 100>
{
typedef BlockSweepScanPolicy<128, 7, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> MultiBlockPolicy;
};
/// Tuning policy for the PTX architecture that DeviceReduceByKey operations will get dispatched to
template <typename T, typename SizeT>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
(CUB_PTX_ARCH >= 300) ?
300 :
(CUB_PTX_ARCH >= 200) ?
200 :
100;
// Tuned policy set for the current PTX compiler pass
typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
// MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {};
/**
* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
*/
static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params)
{
if (ptx_version >= 350)
{
typedef TunedPolicies<T, SizeT, 350> TunedPolicies;
multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>();
}
else if (ptx_version >= 300)
{
typedef TunedPolicies<T, SizeT, 300> TunedPolicies;
multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>();
}
else if (ptx_version >= 200)
{
typedef TunedPolicies<T, SizeT, 200> TunedPolicies;
multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>();
}
else
{
typedef TunedPolicies<T, SizeT, 100> TunedPolicies;
multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>();
}
}
};
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal dispatch routine
*/
template <
typename InitScanKernelPtr, ///< Function type of cub::InitScanKernel
typename MultiBlockScanKernelPtr, ///< Function type of cub::MultiBlockScanKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InitScanKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitScanKernel
MultiBlockScanKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockScanKernel
KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [out] Iterator pointing to scan output
ReductionOp reduction_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported );
#else
enum
{
TILE_STATUS_PADDING = 32,
};
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
cudaError error = cudaSuccess;
do
{
// Number of input tiles
int num_tiles = (num_items + multi_block_dispatch_params.tile_size - 1) / multi_block_dispatch_params.tile_size;
// Temporary storage allocation requirements
void* allocations[2];
size_t allocation_sizes[2] =
{
(num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptor<T>), // bytes needed for tile status descriptors
GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
};
// Alias temporaries (or set the necessary size of the storage allocation)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Global list of tile status
ScanTileDescriptor<T> *d_tile_status = (ScanTileDescriptor<T>*) allocations[0];
// Grid queue descriptor
GridQueue<int> queue(allocations[1]);
// Get GPU id
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Log init_kernel configuration
int init_kernel_threads = 128;
int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads;
if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
// Invoke init_kernel to initialize tile descriptors and queue descriptors
init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>(
queue,
d_tile_status,
num_tiles);
// Sync the stream if specified
#ifndef __CUDA_ARCH__
if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
#else
if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
#endif
// Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
int multi_sm_occupancy = CUB_MIN(
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / multi_block_dispatch_params.block_threads);
#ifndef __CUDA_ARCH__
// We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties
Device device_props;
if (CubDebug(error = device_props.Init(device_ordinal))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
multi_sm_occupancy,
multi_block_kernel,
multi_block_dispatch_params.block_threads))) break;
#endif
// Get device occupancy for multi_block_kernel
int multi_block_occupancy = multi_sm_occupancy * sm_count;
// Get grid size for multi_block_kernel
int multi_block_grid_size = (num_tiles < multi_block_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
multi_block_occupancy; // Fill the device with threadblocks
// Log multi_block_kernel configuration
if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy);
// Invoke multi_block_kernel
multi_block_kernel<<<multi_block_grid_size, multi_block_dispatch_params.block_threads, 0, stream>>>(
d_in,
d_out,
d_tile_status,
reduction_op,
identity,
num_items,
queue);
// Sync the stream if specified
#ifndef __CUDA_ARCH__
if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
#else
if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
#endif
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
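// Worked sizing example for the dispatch above (illustrative; assumes the SM20
// tune of 128 threads * 15 items = 1920-item tiles, and num_items = 1,000,000):
//   num_tiles           = (1000000 + 1920 - 1) / 1920 = 521
//   allocation_sizes[0] = (521 + 32) * sizeof(ScanTileDescriptor<T>)
//   allocation_sizes[1] = GridQueue<int>::AllocationSize()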
/**
* Internal dispatch routine using default tuning policies
*/
template <
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [out] Iterator pointing to scan output
ReductionOp reduction_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuning policies for the PTX architecture that will get dispatched to
typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies;
typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy;
cudaError error = cudaSuccess;
do
{
// Declare dispatch parameters
KernelDispachParams multi_block_dispatch_params;
#ifdef __CUDA_ARCH__
// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
multi_block_dispatch_params.Init<MultiBlockPolicy>();
#else
// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
int ptx_version;
if (CubDebug(error = PtxVersion(ptx_version))) break;
PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params);
#endif
Dispatch(
d_temp_storage,
temp_storage_bytes,
InitScanKernel<T, SizeT>,
MultiBlockScanKernel<MultiBlockPolicy, InputIteratorRA, OutputIteratorRA, T, ReductionOp, Identity, SizeT>,
multi_block_dispatch_params,
d_in,
d_out,
reduction_op,
identity,
num_items,
stream,
stream_synchronous);
if (CubDebug(error)) break;
}
while (0);
return error;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************//**
* Interface
*********************************************************************/
/**
* \brief Computes device-wide reductions of consecutive values whose corresponding keys are equal.
*
* The resulting output lists of value-aggregates and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t ReduceValues(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
ReductionOp reduction_op, ///< [in] Binary value reduction operator
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous);
}
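// Worked example of ReduceValues (illustrative): for keys {0, 0, 1, 1, 1, 3} and
// values {1, 2, 3, 4, 5, 6} with reduction_op = cub::Sum(), the compacted outputs
// would be d_keys_out = {0, 1, 3} and d_values_out = {3, 12, 6}.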
/**
* \brief Computes device-wide sums of consecutive values whose corresponding keys are equal.
*
* The resulting output lists of value-aggregates and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t SumValues(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return ReduceValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, cub::Sum(), num_items, stream, stream_synchronous);
}
/**
* \brief Computes the "run-length" of each group of consecutive, equal-valued keys.
*
* The resulting output lists of run-length counts and their corresponding keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam CountOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for output of key-counts whose value type must be convertible to an integer type (may be a simple pointer type)
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename CountOutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t RunLengths(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
CountOutputIteratorRA d_counts_out, ///< [out] Run-length counts output data (compacted)
int num_items, ///< [in] Total number of keys
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef typename std::iterator_traits<CountOutputIteratorRA>::value_type CountT;
return SumValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, ConstantIteratorRA<CountT>(1), d_counts_out, num_items, stream, stream_synchronous);
}
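// Worked example of RunLengths (illustrative): for keys {5, 5, 5, 2, 7, 7},
// the compacted outputs would be d_keys_out = {5, 2, 7} and d_counts_out = {3, 1, 2}.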
/**
* \brief Removes duplicates within each group of consecutive, equal-valued keys. Only the first key from each group (and corresponding value) is kept.
*
* The resulting keys are compacted.
*
* \devicestorage
*
* \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
* \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
* \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
* \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA
*/
template <
typename KeyInputIteratorRA,
typename KeyOutputIteratorRA,
typename ValueInputIteratorRA,
typename ValueOutputIteratorRA,
typename ReductionOp>
__host__ __device__ __forceinline__
static cudaError_t Unique(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
KeyInputIteratorRA d_keys_in, ///< [in] Key input data
KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
ValueInputIteratorRA d_values_in, ///< [in] Value input data
ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
int num_items, ///< [in] Total number of input pairs
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous);
}
};
/** @} */ // DeviceModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,550 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "device_scan.cuh"
#include "block/block_partition_tiles.cuh"
#include "../grid/grid_queue.cuh"
#include "../util_debug.cuh"
#include "../util_device.cuh"
#include "../util_vector.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Partition kernel entry point (multi-block)
*/
template <
typename BlockPartitionTilesPolicy, ///< Tuning policy for cub::BlockPartitionTiles abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockPartitionTilesPolicy::BLOCK_THREADS))
__global__ void PartitionKernel(
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
LengthOutputIterator d_partition_length, ///< Number of items in the first partition
ScanTileDescriptor<PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> > *d_tile_status, ///< Global list of tile status
PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition
SizeT num_items, ///< Total number of input items for the entire problem
int num_tiles, ///< Total number of input tiles for the entire problem
GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks
{
enum
{
TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
};
typedef PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> PartitionScanTuple;
// Thread block type for scanning input tiles
typedef BlockPartitionTiles<
BlockPartitionTilesPolicy,
InputIteratorRA,
OutputIteratorRA,
PredicateOp,
SizeT> BlockPartitionTilesT;
// Shared memory for BlockPartitionTiles
__shared__ typename BlockPartitionTilesT::TempStorage temp_storage;
// Process tiles
PartitionScanTuple partition_ends; // Ending offsets for partitions (one-after)
bool is_last_tile; // Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input)
BlockPartitionTilesT(temp_storage, d_in, d_out, d_tile_status + TILE_STATUS_PADDING, pred_op, num_items).ConsumeTiles(
queue,
num_tiles,
partition_ends,
is_last_tile);
// Record the length of the first partition
if (is_last_tile && (threadIdx.x == 0))
{
*d_partition_length = partition_ends.x;
}
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* DeviceReorder
*****************************************************************************/
/**
* \addtogroup DeviceModule
* @{
*/
/**
* \brief DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory
*/
struct DeviceReorder
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockPartitionTilesPolicy.
struct KernelDispachParams
{
int block_threads;
int items_per_thread;
BlockScanAlgorithm scan_algorithm;
int tile_size;
template <typename BlockPartitionTilesPolicy>
__host__ __device__ __forceinline__
void Init()
{
block_threads = BlockPartitionTilesPolicy::BLOCK_THREADS;
items_per_thread = BlockPartitionTilesPolicy::ITEMS_PER_THREAD;
scan_algorithm = BlockPartitionTilesPolicy::SCAN_ALGORITHM;
tile_size = block_threads * items_per_thread;
}
};
/******************************************************************************
* Tuning policies
******************************************************************************/
/// Specializations of tuned policy types for different PTX architectures
template <
int PARTITIONS,
typename T,
typename SizeT,
int ARCH>
struct TunedPolicies;
/// SM35 tune
template <int PARTITIONS, typename T, typename SizeT>
struct TunedPolicies<PARTITIONS, T, SizeT, 350>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 16,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy;
};
/// SM30 tune
template <int PARTITIONS, typename T, typename SizeT>
struct TunedPolicies<PARTITIONS, T, SizeT, 300>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 9,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockPartitionTilesPolicy<PARTITIONS, 256, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy;
};
/// SM20 tune
template <int PARTITIONS, typename T, typename SizeT>
struct TunedPolicies<PARTITIONS, T, SizeT, 200>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 15,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy;
};
/// SM10 tune
template <int PARTITIONS, typename T, typename SizeT>
struct TunedPolicies<PARTITIONS, T, SizeT, 100>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 7,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING> PartitionPolicy;
};
/// Tuning policy for the PTX architecture that DevicePartition operations will get dispatched to
template <int PARTITIONS, typename T, typename SizeT>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
(CUB_PTX_ARCH >= 300) ?
300 :
(CUB_PTX_ARCH >= 200) ?
200 :
100;
// Tuned policy set for the current PTX compiler pass
typedef TunedPolicies<PARTITIONS, T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
// PartitionPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct PartitionPolicy : PtxTunedPolicies::PartitionPolicy {};
/**
* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
*/
static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params)
{
if (ptx_version >= 350)
{
typedef TunedPolicies<PARTITIONS, T, SizeT, 350> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>();
}
else if (ptx_version >= 300)
{
typedef TunedPolicies<PARTITIONS, T, SizeT, 300> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>();
}
else if (ptx_version >= 200)
{
typedef TunedPolicies<PARTITIONS, T, SizeT, 200> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>();
}
else
{
typedef TunedPolicies<PARTITIONS, T, SizeT, 100> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>();
}
}
};
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal dispatch routine
*/
template <
typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
typename PartitionKernelPtr, ///< Function type of cub::PartitionKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate functor type indicating membership in the first partition, having member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
int ptx_version, ///< [in] PTX version
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionInitKernel
PartitionKernelPtr partition_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionKernel
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p partition_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
SizeT num_items, ///< [in] Total number of items to partition
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
enum
{
TILE_STATUS_PADDING = 32,
};
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Scan tuple type and tile status descriptor type
typedef typename VectorHelper<SizeT, 2>::Type ScanTuple;
typedef ScanTileDescriptor<ScanTuple> ScanTileDescriptorT;
cudaError error = cudaSuccess;
do
{
// Number of input tiles
int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
// Temporary storage allocation requirements
void* allocations[2];
size_t allocation_sizes[2] =
{
(num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors
GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
};
// Alias temporaries (or set the necessary size of the storage allocation)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Global list of tile status
ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0];
// Grid queue descriptor
GridQueue<int> queue(allocations[1]);
// Log init_kernel configuration
int init_kernel_threads = 128;
int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads;
if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream);
// Invoke init_kernel to initialize tile descriptors and queue descriptors
init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>(
queue,
d_tile_status,
num_tiles);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Get grid size for multi-block kernel
int scan_grid_size;
int multi_sm_occupancy = -1;
if (ptx_version < 200)
{
// We don't have atomics (or don't have fast ones), so just assign one
// block per tile (limited to 65K tiles)
scan_grid_size = num_tiles;
}
else
{
// We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor.
// Get GPU id
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Get a rough estimate of partition_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
multi_sm_occupancy = CUB_MIN(
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads);
#ifndef __CUDA_ARCH__
// We're on the host, so refine the occupancy estimate using the actual device properties
Device device_props;
if (CubDebug(error = device_props.Init(device_ordinal))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
multi_sm_occupancy,
partition_kernel,
scan_dispatch_params.block_threads))) break;
#endif
// Get device occupancy for partition_kernel
int scan_occupancy = multi_sm_occupancy * sm_count;
// Get grid size for partition_kernel
scan_grid_size = (num_tiles < scan_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
scan_occupancy; // Fill the device with threadblocks
}
// Log partition_kernel configuration
if (stream_synchronous) CubLog("Invoking partition_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy);
// Invoke partition_kernel
partition_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>(
d_in,
d_out,
d_partition_length,
d_tile_status,
pred_op,
num_items,
num_tiles,
queue);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal partition dispatch routine for using default tuning policies
*/
template <
int PARTITIONS, ///< Number of partitions we are keeping
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type)
typename PredicateOp, ///< Unary predicate functor type indicating membership in the first partition, having member <tt>bool operator()(const T &val)</tt>
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to input items
OutputIteratorRA d_out, ///< [in] Iterator pointing to output items
LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
SizeT num_items, ///< [in] Total number of items to partition
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuning policies
typedef PtxDefaultPolicies<PARTITIONS, T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
typedef typename PtxDefaultPolicies::PartitionPolicy PartitionPolicy; // Partition kernel policy
cudaError error = cudaSuccess;
do
{
// Declare dispatch parameters
KernelDispachParams scan_dispatch_params;
int ptx_version;
#ifdef __CUDA_ARCH__
// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
scan_dispatch_params.Init<PartitionPolicy>();
ptx_version = CUB_PTX_ARCH;
#else
// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
if (CubDebug(error = PtxVersion(ptx_version))) break;
PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params);
#endif
Dispatch(
ptx_version,
d_temp_storage,
temp_storage_bytes,
ScanInitKernel<T, SizeT>,
PartitionKernel<PartitionPolicy, InputIteratorRA, OutputIteratorRA, LengthOutputIterator, PredicateOp, SizeT>,
scan_dispatch_params,
d_in,
d_out,
d_partition_length,
pred_op,
num_items,
stream,
stream_synchronous);
if (CubDebug(error)) break;
}
while (0);
return error;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Splits a list of input items into two partitions within the given output list using the specified predicate. The relative ordering of inputs is not necessarily preserved.
*
* An item \p val is placed in the first partition if <tt>pred_op(val) == true</tt>, otherwise
* it is placed in the second partition. The offset of the partitioning pivot (equivalent to
* the total length of the first partition as well as the starting offset of the second), is
* recorded to \p d_partition_length.
*
* The length of the output referenced by \p d_out is assumed to be the same as that of \p d_in.
*
* \devicestorage
*
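* \par
* The code snippet below is a usage sketch (illustrative only, not taken from the library's
* documentation): it partitions a device vector of \p int items using a hypothetical predicate
* functor \p LessThan, with pointer names chosen purely for illustration.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Functor returning true for items that belong in the first partition
* struct LessThan
* {
*     int compare;
*     __host__ __device__ __forceinline__ LessThan(int compare) : compare(compare) {}
*     __host__ __device__ __forceinline__ bool operator()(const int &val) const { return val < compare; }
* };
*
* // Declare and initialize device pointers for input, output, and the first-partition length
* int *d_in, *d_out, *d_partition_length;
* int num_items = ...
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceReorder::Partition(d_temp_storage, temp_storage_bytes, d_in, d_out, d_partition_length, LessThan(4), num_items);
*
* // Allocate temporary storage and run the partition
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
* cub::DeviceReorder::Partition(d_temp_storage, temp_storage_bytes, d_in, d_out, d_partition_length, LessThan(4), num_items);
*
* \endcode
*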
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam LengthOutputIterator <b>[inferred]</b> Output iterator type for recording the length of the first partition (may be a simple pointer type)
* \tparam PredicateOp <b>[inferred]</b> Unary predicate functor type indicating membership in the first partition, having member <tt>bool operator()(const T &val)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename LengthOutputIterator,
typename PredicateOp>
__host__ __device__ __forceinline__
static cudaError_t Partition(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to input items
OutputIteratorRA d_out, ///< [in] Iterator pointing to output items
LengthOutputIterator d_pivot_offset, ///< [out] Output iterator referencing the location where the pivot offset is to be recorded
PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition
int num_items, ///< [in] Total number of items to partition
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch<2>(d_temp_storage, temp_storage_bytes, d_in, d_out, d_pivot_offset, pred_op, num_items, stream, stream_synchronous);
}
};
/** @} */ // DeviceModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,812 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory.
*/
#pragma once
#include <stdio.h>
#include <iterator>
#include "block/block_scan_tiles.cuh"
#include "../thread/thread_operators.cuh"
#include "../grid/grid_queue.cuh"
#include "../util_debug.cuh"
#include "../util_device.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Kernel entry points
*****************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Initialization kernel for tile status initialization (multi-block)
*/
template <
typename T, ///< Scan value type
typename SizeT> ///< Integer type used for global array indexing
__global__ void ScanInitKernel(
GridQueue<SizeT> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
ScanTileDescriptor<T> *d_tile_status, ///< [out] Tile status words
int num_tiles) ///< [in] Number of tiles
{
typedef ScanTileDescriptor<T> ScanTileDescriptorT;
enum
{
TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
};
// Reset queue descriptor
if ((blockIdx.x == 0) && (threadIdx.x == 0)) grid_queue.ResetDrain(num_tiles);
// Initialize tile status
int tile_offset = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tile_offset < num_tiles)
{
// Not-yet-set
d_tile_status[TILE_STATUS_PADDING + tile_offset].status = SCAN_TILE_INVALID;
}
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
{
// Padding
d_tile_status[threadIdx.x].status = SCAN_TILE_OOB;
}
}
/**
* Scan kernel entry point (multi-block)
*/
template <
typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename T, ///< The scan data type
typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS))
__global__ void ScanKernel(
InputIteratorRA d_in, ///< Input data
OutputIteratorRA d_out, ///< Output data
ScanTileDescriptor<T> *d_tile_status, ///< Global list of tile status
ScanOp scan_op, ///< Binary scan operator
Identity identity, ///< Identity element
SizeT num_items, ///< Total number of scan items for the entire problem
GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks
{
enum
{
TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS,
};
// Thread block type for scanning input tiles
typedef BlockScanTiles<
BlockScanTilesPolicy,
InputIteratorRA,
OutputIteratorRA,
ScanOp,
Identity,
SizeT> BlockScanTilesT;
// Shared memory for BlockScanTiles
__shared__ typename BlockScanTilesT::TempStorage temp_storage;
// Process tiles
BlockScanTilesT(temp_storage, d_in, d_out, scan_op, identity).ConsumeTiles(
num_items,
queue,
d_tile_status + TILE_STATUS_PADDING);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* DeviceScan
*****************************************************************************/
/**
* \brief DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](device_scan.png)
* \ingroup DeviceModule
*
* \par Overview
* Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
* produces an output list where each element is computed to be the reduction
* of the elements occurring earlier in the input list. <em>Prefix sum</em>
* connotes a prefix scan with the addition operator. The term \em inclusive indicates
* that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
* The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
* the <em>i</em><sup>th</sup> output reduction.
*
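* \par
* For example, an exclusive prefix sum of the list [8, 6, 7, 5] is [0, 8, 14, 21],
* whereas the corresponding inclusive prefix sum is [8, 14, 21, 26].
*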
* \par Usage Considerations
* \cdp_class{DeviceScan}
*
* \par Performance
*
* \image html scan_perf.png
*
*/
struct DeviceScan
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/******************************************************************************
* Constants and typedefs
******************************************************************************/
/// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockScanTilesPolicy.
struct KernelDispachParams
{
// Policy fields
int block_threads;
int items_per_thread;
BlockLoadAlgorithm load_policy;
BlockStoreAlgorithm store_policy;
BlockScanAlgorithm scan_algorithm;
// Other misc
int tile_size;
template <typename BlockScanTilesPolicy>
__host__ __device__ __forceinline__
void Init()
{
block_threads = BlockScanTilesPolicy::BLOCK_THREADS;
items_per_thread = BlockScanTilesPolicy::ITEMS_PER_THREAD;
load_policy = BlockScanTilesPolicy::LOAD_ALGORITHM;
store_policy = BlockScanTilesPolicy::STORE_ALGORITHM;
scan_algorithm = BlockScanTilesPolicy::SCAN_ALGORITHM;
tile_size = block_threads * items_per_thread;
}
__host__ __device__ __forceinline__
void Print()
{
printf("%d, %d, %d, %d, %d",
block_threads,
items_per_thread,
load_policy,
store_policy,
scan_algorithm);
}
};
/******************************************************************************
* Tuning policies
******************************************************************************/
/// Specializations of tuned policy types for different PTX architectures
template <
typename T,
typename SizeT,
int ARCH>
struct TunedPolicies;
/// SM35 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 350>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 16,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
// ScanPolicy: GTX Titan: 29.1B items/s (232.4 GB/s) @ 48M 32-bit T
typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};
/// SM30 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 300>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 9,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockScanTilesPolicy<256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};
/// SM20 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 200>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 15,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
// ScanPolicy: GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
};
/// SM10 tune
template <typename T, typename SizeT>
struct TunedPolicies<T, SizeT, 100>
{
enum {
NOMINAL_4B_ITEMS_PER_THREAD = 7,
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};
typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> ScanPolicy;
};
/// Tuning policy for the PTX architecture that DeviceScan operations will get dispatched to
template <typename T, typename SizeT>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
(CUB_PTX_ARCH >= 300) ?
300 :
(CUB_PTX_ARCH >= 200) ?
200 :
100;
// Tuned policy set for the current PTX compiler pass
typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies;
// ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass
struct ScanPolicy : PtxTunedPolicies::ScanPolicy {};
/**
* Initialize dispatch params with the policies corresponding to the PTX assembly we will use
*/
static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params)
{
if (ptx_version >= 350)
{
typedef TunedPolicies<T, SizeT, 350> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>();
}
else if (ptx_version >= 300)
{
typedef TunedPolicies<T, SizeT, 300> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>();
}
else if (ptx_version >= 200)
{
typedef TunedPolicies<T, SizeT, 200> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>();
}
else
{
typedef TunedPolicies<T, SizeT, 100> TunedPolicies;
scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>();
}
}
};
/******************************************************************************
* Utility methods
******************************************************************************/
/**
* Internal dispatch routine
*/
template <
typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel
typename ScanKernelPtr, ///< Function type of cub::ScanKernel
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
int ptx_version, ///< [in] PTX version
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel
ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanKernel
KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
#ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);
#else
enum
{
TILE_STATUS_PADDING = 32,
INIT_KERNEL_THREADS = 128
};
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tile status descriptor type
typedef ScanTileDescriptor<T> ScanTileDescriptorT;
cudaError error = cudaSuccess;
do
{
// Number of input tiles
int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size;
// Temporary storage allocation requirements
void* allocations[2];
size_t allocation_sizes[2] =
{
(num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors
GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor
};
// Alias temporaries (or set the necessary size of the storage allocation)
if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == NULL)
return cudaSuccess;
// Global list of tile status
ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0];
// Grid queue descriptor
GridQueue<int> queue(allocations[1]);
// Log init_kernel configuration
int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
// Invoke init_kernel to initialize tile descriptors and queue descriptors
init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
queue,
d_tile_status,
num_tiles);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
// Get grid size for multi-block kernel
int scan_grid_size;
int multi_sm_occupancy = -1;
if (ptx_version < 200)
{
// We don't have atomics (or don't have fast ones), so just assign one
// block per tile (limited to 65K tiles)
scan_grid_size = num_tiles;
}
else
{
// We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor.
// Get GPU id
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
// Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Get a rough estimate of scan_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture
multi_sm_occupancy = CUB_MIN(
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS,
ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads);
#ifndef __CUDA_ARCH__
// We're on the host, so refine the occupancy estimate using the actual device properties
Device device_props;
if (CubDebug(error = device_props.Init(device_ordinal))) break;
if (CubDebug(error = device_props.MaxSmOccupancy(
multi_sm_occupancy,
scan_kernel,
scan_dispatch_params.block_threads))) break;
#endif
// Get device occupancy for scan_kernel
int scan_occupancy = multi_sm_occupancy * sm_count;
// Get grid size for scan_kernel
scan_grid_size = (num_tiles < scan_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
scan_occupancy; // Fill the device with threadblocks
}
// Log scan_kernel configuration
if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy);
// Invoke scan_kernel
scan_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>(
d_in,
d_out,
d_tile_status,
scan_op,
identity,
num_items,
queue);
// Sync the stream if specified
if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break;
}
while (0);
return error;
#endif // CUB_RUNTIME_ENABLED
}
/**
* Internal scan dispatch routine for using default tuning policies
*/
template <
typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type)
typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type)
typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
typename Identity, ///< Identity value type (cub::NullType for inclusive scans)
typename SizeT> ///< Integer type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
SizeT num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Data type
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
// Tuning policies
typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies
typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy
cudaError error = cudaSuccess;
do
{
// Declare dispatch parameters
KernelDispachParams scan_dispatch_params;
int ptx_version;
#ifdef __CUDA_ARCH__
// We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly
scan_dispatch_params.Init<ScanPolicy>();
ptx_version = CUB_PTX_ARCH;
#else
// We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version
if (CubDebug(error = PtxVersion(ptx_version))) break;
PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params);
#endif
Dispatch(
ptx_version,
d_temp_storage,
temp_storage_bytes,
ScanInitKernel<T, SizeT>,
ScanKernel<ScanPolicy, InputIteratorRA, OutputIteratorRA, T, ScanOp, Identity, SizeT>,
scan_dispatch_params,
d_in,
d_out,
scan_op,
identity,
num_items,
stream,
stream_synchronous);
if (CubDebug(error)) break;
}
while (0);
return error;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************//**
* \name Exclusive scans
*********************************************************************/
//@{
/**
* \brief Computes a device-wide exclusive prefix sum.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the exclusive prefix sum of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_scan_input, *d_scan_output;
* int num_items = ...
*
* ...
*
* // Determine temporary device storage requirements for exclusive prefix sum
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
*
* // Allocate temporary storage for exclusive prefix sum
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run exclusive prefix sum
* cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveSum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
typedef typename std::iterator_traits<InputIteratorRA>::value_type T;
return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), T(), num_items, stream, stream_synchronous);
}
/**
* \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.
*
* \par
* Supports non-commutative scan operators.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the exclusive prefix scan of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_scan_input, *d_scan_output;
* int num_items = ...
*
* ...
*
* // Determine temporary device storage requirements for exclusive prefix scan
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) INT_MIN, num_items);
*
* // Allocate temporary storage for exclusive prefix scan
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run exclusive prefix scan (max)
* cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) INT_MIN, num_items);
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
* \tparam Identity <b>[inferred]</b> Type of the \p identity value
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ScanOp,
typename Identity>
__host__ __device__ __forceinline__
static cudaError_t ExclusiveScan(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
Identity identity, ///< [in] Identity element
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, identity, num_items, stream, stream_synchronous);
}
//@} end member group
/******************************************************************//**
* \name Inclusive scans
*********************************************************************/
//@{
/**
* \brief Computes a device-wide inclusive prefix sum.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the inclusive prefix sum of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_scan_input, *d_scan_output;
* int num_items = ...
* ...
*
* // Determine temporary device storage requirements for inclusive prefix sum
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
*
* // Allocate temporary storage for inclusive prefix sum
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run inclusive prefix sum
* cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items);
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA>
__host__ __device__ __forceinline__
static cudaError_t InclusiveSum(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream, stream_synchronous);
}
/**
* \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
*
* \par
* Supports non-commutative scan operators.
*
* \devicestorage
*
* \cdp
*
* \iterator
*
* \par
* The code snippet below illustrates the inclusive prefix scan of a device vector of \p int items.
* \par
* \code
* #include <cub/cub.cuh>
* ...
*
* // Declare and initialize device pointers for input and output
* int *d_scan_input, *d_scan_output;
* int num_items = ...
* ...
*
* // Determine temporary device storage requirements for inclusive prefix scan
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
* cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items);
*
* // Allocate temporary storage for inclusive prefix scan
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run inclusive prefix scan (max)
* cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items);
*
* \endcode
*
* \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type)
* \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type)
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename InputIteratorRA,
typename OutputIteratorRA,
typename ScanOp>
__host__ __device__ __forceinline__
static cudaError_t InclusiveScan(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
InputIteratorRA d_in, ///< [in] Iterator pointing to scan input
OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output
ScanOp scan_op, ///< [in] Binary scan operator
int num_items, ///< [in] Total number of items to scan
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream, stream_synchronous);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,211 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
#pragma once
#include "../util_debug.cuh"
#include "../util_namespace.cuh"
#include "../thread/thread_load.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
*/
class GridBarrier
{
protected :
typedef unsigned int SyncFlag;
// Counters in global device memory
SyncFlag* d_sync;
public:
/**
* Constructor
*/
GridBarrier() : d_sync(NULL) {}
/**
* Synchronize
*/
__device__ __forceinline__ void Sync() const
{
volatile SyncFlag *d_vol_sync = d_sync;
// Threadfence and syncthreads to make sure global writes are visible before
// thread-0 reports in with its sync counter
__threadfence();
__syncthreads();
if (blockIdx.x == 0)
{
// Report in ourselves
if (threadIdx.x == 0)
{
d_vol_sync[blockIdx.x] = 1;
}
__syncthreads();
// Wait for everyone else to report in
for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
{
__threadfence_block();
}
}
__syncthreads();
// Let everyone know it's safe to proceed
for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
{
d_vol_sync[peer_block] = 0;
}
}
else
{
if (threadIdx.x == 0)
{
// Report in
d_vol_sync[blockIdx.x] = 1;
// Wait for acknowledgment
while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
{
__threadfence_block();
}
}
__syncthreads();
}
}
};
/**
* \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
*
* Uses RAII for lifetime, i.e., device resources are reclaimed when
* the destructor is called.
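*
* \par
* A minimal host-side usage sketch (illustrative only; \p my_kernel and \p block_threads are
* hypothetical, and the grid must be small enough that all threadblocks are co-resident for a
* software barrier to be safe):
* \par
* \code
* // Host side:
* int grid_size = ...
* GridBarrierLifetime global_barrier;
* global_barrier.Setup(grid_size);              // lazily allocates and zeroes the sync counters
* my_kernel<<<grid_size, block_threads>>>(global_barrier, ...);   // kernel parameter may be typed as GridBarrier
*
* // Device side (inside my_kernel, taking the barrier by value):
* //     barrier.Sync();                        // every thread of every resident block must participate
* \endcode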
*/
class GridBarrierLifetime : public GridBarrier
{
protected:
// Number of bytes backed by d_sync
size_t sync_bytes;
public:
/**
* Constructor
*/
GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
/**
* Frees the device storage and resets the progress counters
*/
cudaError_t HostReset()
{
cudaError_t retval = cudaSuccess;
if (d_sync)
{
CubDebug(retval = cudaFree(d_sync));
d_sync = NULL;
}
sync_bytes = 0;
return retval;
}
/**
* Destructor
*/
virtual ~GridBarrierLifetime()
{
HostReset();
}
/**
* Sets up the progress counters for the next kernel launch (lazily
* allocating and initializing them if necessary)
*/
cudaError_t Setup(int sweep_grid_size)
{
cudaError_t retval = cudaSuccess;
do {
size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
if (new_sync_bytes > sync_bytes)
{
if (d_sync)
{
if (CubDebug(retval = cudaFree(d_sync))) break;
}
sync_bytes = new_sync_bytes;
// Allocate and initialize to zero
if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
}
} while (0);
return retval;
}
};
/** @} */ // end group GridModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,197 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
*/
#pragma once
#include "../util_namespace.cuh"
#include "../util_macro.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
*
* \par Overview
* GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
* Threadblocks may receive one of three different amounts of work: "big", "normal",
* and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit
* for the last threadblock may be partially-full if the input is not an even multiple of
* the scheduling grain size.
*
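* \par
* For example, distributing 1000 input items with a scheduling granularity of 128 yields 8 grains.
* With a maximum grid size of 3, the first two ("big") threadblocks each receive 3 grains
* (384 items), and the last threadblock receives the remaining 2 grains, clamped to the
* final 232 items.
*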
* \par
* Before invoking a child grid, a parent thread will typically construct and initialize an instance of
* GridEvenShare using \p GridInit(). The instance can be passed to child threadblocks which can
* initialize their per-threadblock offsets using \p BlockInit().
*
* \tparam SizeT Integer type for array indexing
*/
template <typename SizeT>
class GridEvenShare
{
private:
SizeT total_grains;
int big_blocks;
SizeT big_share;
SizeT normal_share;
SizeT normal_base_offset;
public:
/// Total number of input items
SizeT num_items;
/// Grid size in threadblocks
int grid_size;
/// Offset into input marking the beginning of the owning thread block's segment of input tiles
SizeT block_offset;
/// Offset into input marking the end (one-past) of the owning thread block's segment of input tiles
SizeT block_oob;
/**
* \brief Block-based constructor for single-block grids.
*/
__device__ __forceinline__ GridEvenShare(SizeT num_items) :
num_items(num_items),
grid_size(1),
block_offset(0),
block_oob(num_items) {}
/**
* \brief Default constructor. Zero-initializes block-specific fields.
*/
__host__ __device__ __forceinline__ GridEvenShare() :
num_items(0),
grid_size(0),
block_offset(0),
block_oob(0) {}
/**
* \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior to kernel launch.
*/
__host__ __device__ __forceinline__ void GridInit(
SizeT num_items, ///< Total number of input items
int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
int schedule_granularity) ///< Granularity by which the input can be parcelled out and distributed among threadblocks. Usually the thread block's native tile size (or a multiple thereof).
{
this->num_items = num_items;
this->block_offset = 0;
this->block_oob = 0;
this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity;
this->grid_size = CUB_MIN(total_grains, max_grid_size);
SizeT grains_per_block = total_grains / grid_size;
this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks
this->normal_share = grains_per_block * schedule_granularity;
this->normal_base_offset = big_blocks * schedule_granularity;
this->big_share = normal_share + schedule_granularity;
}
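// Worked example (illustrative numbers): GridInit(1000, 3, 128) gives
//   total_grains       = ceil(1000 / 128)  = 8
//   grid_size          = min(8, 3)         = 3
//   grains_per_block   = 8 / 3             = 2
//   big_blocks         = 8 - 2 * 3         = 2    (two blocks receive 3 grains)
//   normal_share       = 2 * 128           = 256
//   normal_base_offset = 2 * 128           = 256
//   big_share          = 256 + 128         = 384
// BlockInit() then yields the segments [0,384), [384,768), and [768,1000);
// the last block is clamped to num_items.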
/**
* \brief Initializes the threadblock-specific details (e.g., to be called by each threadblock after startup)
*/
__device__ __forceinline__ void BlockInit()
{
if (blockIdx.x < big_blocks)
{
// This threadblock gets a big share of grains (grains_per_block + 1)
block_offset = (blockIdx.x * big_share);
block_oob = block_offset + big_share;
}
else if (blockIdx.x < total_grains)
{
// This threadblock gets a normal share of grains (grains_per_block)
block_offset = normal_base_offset + (blockIdx.x * normal_share);
block_oob = block_offset + normal_share;
}
// Last threadblock
if (blockIdx.x == grid_size - 1)
{
block_oob = num_items;
}
}
/**
* Print to stdout
*/
__host__ __device__ __forceinline__ void Print()
{
printf(
#ifdef __CUDA_ARCH__
"\tthreadblock(%d) "
"block_offset(%lu) "
"block_oob(%lu) "
#endif
"num_items(%lu) "
"total_grains(%lu) "
"big_blocks(%lu) "
"big_share(%lu) "
"normal_share(%lu)\n",
#ifdef __CUDA_ARCH__
blockIdx.x,
(unsigned long) block_offset,
(unsigned long) block_oob,
#endif
(unsigned long) num_items,
(unsigned long) total_grains,
(unsigned long) big_blocks,
(unsigned long) big_share,
(unsigned long) normal_share);
}
};
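/**
 * A minimal usage sketch (the kernel name, tile size, and grid-size limit below
 * are illustrative assumptions): the host fills in the descriptor with
 * GridInit() and each thread block derives its segment with BlockInit().
 *
 * \code
 * template <int TILE_ITEMS>
 * __global__ void ExampleKernel(GridEvenShare<int> even_share, int *d_in)
 * {
 *     even_share.BlockInit();
 *     for (int offset = even_share.block_offset;
 *          offset < even_share.block_oob;
 *          offset += TILE_ITEMS)
 *     {
 *         // Consume up to TILE_ITEMS items starting at d_in + offset
 *         // (the final tile of the last block may be partial)
 *     }
 * }
 *
 * // Host side
 * int *d_in;                  // assumed device buffer of num_items ints
 * int num_items = 1000000;
 * GridEvenShare<int> even_share;
 * even_share.GridInit(num_items, 120, 128);
 * ExampleKernel<128><<<even_share.grid_size, 256>>>(even_share, d_in);
 * \endcode
 */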
/** @} */ // end group GridModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,95 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
#pragma once
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
* @{
*/
/******************************************************************************
* Mapping policies
*****************************************************************************/
/**
* \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
* \brief An "even-share" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
* constant and corresponds loosely to the number of thread blocks that may
* actively reside on the target device. Each segment is comprised of
* consecutive tiles, where a tile is a small, constant-sized unit of input
* to be processed to completion before the thread block terminates or
* obtains more work. The kernel invokes \p p thread blocks, each
* of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
* in tile-size increments.
*/
GRID_MAPPING_EVEN_SHARE,
/**
* \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
* thread blocks. Work is atomically dequeued in tiles, where a tile is a
* unit of input to be processed to completion before the thread block
* terminates or obtains more work. The grid size \p p is constant,
* loosely corresponding to the number of thread blocks that may actively
* reside on the target device.
*/
GRID_MAPPING_DYNAMIC,
};
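/**
 * A minimal sketch of compile-time selection on the strategy (the kernel and
 * tile size below are illustrative assumptions; GridEvenShare and GridQueue
 * are declared in grid_even_share.cuh and grid_queue.cuh):
 *
 * \code
 * template <GridMappingStrategy STRATEGY, int TILE_ITEMS>
 * __global__ void ConsumeKernel(GridEvenShare<int> even_share, GridQueue<int> queue, int *d_in)
 * {
 *     if (STRATEGY == GRID_MAPPING_EVEN_SHARE)
 *     {
 *         even_share.BlockInit();
 *         // Iterate tiles in [even_share.block_offset, even_share.block_oob)
 *     }
 *     else // GRID_MAPPING_DYNAMIC
 *     {
 *         // Repeatedly call queue.Drain(TILE_ITEMS) until past the fill-size
 *     }
 * }
 * \endcode
 */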
/** @} */ // end group GridModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,207 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::GridQueue is a descriptor utility for dynamic queue management.
*/
#pragma once
#include "../util_namespace.cuh"
#include "../util_debug.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup GridModule
* @{
*/
/**
* \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
* GridQueue descriptors provide abstractions for "filling" or
* "draining" globally-shared vectors.
*
* \par
* A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
* returning a unique offset for the calling thread to write its items.
* The GridQueue maintains the total "fill-size". The fill counter must be reset
* using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
* will be filling.
*
* \par
* Similarly a "draining" GridQueue works by works by atomically-incrementing a
* zero-initialized counter, returning a unique offset for the calling thread to
* read its items. Threads can safely drain until the array's logical fill-size is
* exceeded. The drain counter must be reset using GridQueue::ResetDrain or
* GridQueue::ResetDrainAfterFill by the host or kernel instance prior to the kernel instance that
* will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
* is simply the number of elements in the array.)
*
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
*
* \tparam SizeT Integer type for array indexing
*/
template <typename SizeT>
class GridQueue
{
private:
/// Counter indices
enum
{
FILL = 0,
DRAIN = 1,
};
/// Pair of counters
SizeT *d_counters;
public:
/// Returns the device allocation size in bytes needed to construct a GridQueue instance
__host__ __device__ __forceinline__
static size_t AllocationSize()
{
return sizeof(SizeT) * 2;
}
/// Constructs an invalid GridQueue descriptor around the device storage allocation
__host__ __device__ __forceinline__ GridQueue(
void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>.
:
d_counters((SizeT*) d_storage)
{}
/// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
__host__ __device__ __forceinline__ cudaError_t ResetDrainAfterFill(cudaStream_t stream = 0)
{
#ifdef __CUDA_ARCH__
d_counters[DRAIN] = 0;
return cudaSuccess;
#else
return ResetDrain(0, stream);
#endif
}
/// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
__host__ __device__ __forceinline__ cudaError_t ResetDrain(
SizeT fill_size,
cudaStream_t stream = 0)
{
#ifdef __CUDA_ARCH__
d_counters[FILL] = fill_size;
d_counters[DRAIN] = 0;
return cudaSuccess;
#else
SizeT counters[2];
counters[FILL] = fill_size;
counters[DRAIN] = 0;
return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(SizeT) * 2, cudaMemcpyHostToDevice, stream));
#endif
}
/// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
__host__ __device__ __forceinline__ cudaError_t ResetFill()
{
#ifdef __CUDA_ARCH__
d_counters[FILL] = 0;
return cudaSuccess;
#else
return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(SizeT)));
#endif
}
/// Returns the fill-size established by the parent or by the previous kernel.
__host__ __device__ __forceinline__ cudaError_t FillSize(
SizeT &fill_size,
cudaStream_t stream = 0)
{
#ifdef __CUDA_ARCH__
fill_size = d_counters[FILL];
return cudaSuccess;
#else
return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(SizeT), cudaMemcpyDeviceToHost, stream));
#endif
}
/// Drain num_items. Returns offset from which to read items.
__device__ __forceinline__ SizeT Drain(SizeT num_items)
{
return atomicAdd(d_counters + DRAIN, num_items);
}
/// Fill num_items. Returns offset from which to write items.
__device__ __forceinline__ SizeT Fill(SizeT num_items)
{
return atomicAdd(d_counters + FILL, num_items);
}
};
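/**
 * A minimal sketch of the drain pattern (DrainKernel, the tile size, and the
 * launch configuration below are illustrative assumptions):
 *
 * \code
 * __global__ void DrainKernel(GridQueue<int> queue, int *d_in, int num_items)
 * {
 *     const int TILE_ITEMS = 128;
 *     __shared__ int block_offset;
 *     while (true)
 *     {
 *         if (threadIdx.x == 0)
 *             block_offset = queue.Drain(TILE_ITEMS);   // atomically claim a tile
 *         __syncthreads();
 *         if (block_offset >= num_items)
 *             break;                                    // queue exhausted
 *         // Process min(TILE_ITEMS, num_items - block_offset) items at d_in + block_offset
 *         __syncthreads();                              // before the leader claims the next tile
 *     }
 * }
 *
 * // Host side
 * int num_items = 1 << 20;
 * int *d_in;                                            // assumed device buffer
 * void *d_storage;
 * cudaMalloc(&d_storage, GridQueue<int>::AllocationSize());
 * GridQueue<int> queue(d_storage);
 * queue.ResetDrain(num_items);
 * DrainKernel<<<64, 256>>>(queue, d_in, num_items);
 * \endcode
 */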
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Reset grid queue (call with 1 block of 1 thread)
*/
template <typename SizeT>
__global__ void ResetDrainKernel(
GridQueue<SizeT> grid_queue,
SizeT num_items)
{
grid_queue.ResetDrain(num_items);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group GridModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,123 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++)
*/
#pragma once
#if defined(_WIN32) || defined(_WIN64)
#include <intrin.h>
#include <windows.h>
#undef small // Windows is terrible for polluting macro namespace
/**
* Compiler read/write barrier
*/
#pragma intrinsic(_ReadWriteBarrier)
#endif
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
#if defined(_MSC_VER)
// Microsoft VC++
typedef long Spinlock;
#else
// GNU g++
typedef int Spinlock;
/**
* Compiler read/write barrier
*/
__forceinline__ void _ReadWriteBarrier()
{
__sync_synchronize();
}
/**
* Atomic exchange
*/
__forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
{
// NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
_ReadWriteBarrier();
return __sync_lock_test_and_set(Target, Value);
}
/**
* Pause instruction to prevent excess processor bus usage
*/
__forceinline__ void YieldProcessor()
{
#ifndef __arm__
asm volatile("pause\n": : :"memory");
#endif // __arm__
}
#endif // defined(_MSC_VER)
/**
* Return when the specified spinlock has been acquired
*/
__forceinline__ void Lock(volatile Spinlock *lock)
{
while (1)
{
if (!_InterlockedExchange(lock, 1)) return;
while (*lock) YieldProcessor();
}
}
/**
* Release the specified spinlock
*/
__forceinline__ void Unlock(volatile Spinlock *lock)
{
_ReadWriteBarrier();
*lock = 0;
}
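/**
 * A minimal host-side usage sketch (the counter and function below are
 * illustrative assumptions):
 *
 * \code
 * static Spinlock g_lock = 0;
 * static int g_counter = 0;
 *
 * void IncrementCounter()
 * {
 *     Lock(&g_lock);       // spin until acquired
 *     ++g_counter;         // critical section shared among host threads
 *     Unlock(&g_lock);     // compiler barrier, then release
 * }
 * \endcode
 */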
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,429 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for reading memory using PTX cache modifiers.
*/
#pragma once
#include <cuda.h>
#include <iterator>
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup IoModule
* @{
*/
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
* \brief Enumeration of PTX cache-modifiers for memory load operations.
*/
enum PtxLoadModifier
{
LOAD_DEFAULT, ///< Default (no modifier)
LOAD_CA, ///< Cache at all levels
LOAD_CG, ///< Cache at global level
LOAD_CS, ///< Cache streaming (likely to be accessed once)
LOAD_CV, ///< Cache as volatile (including cached system lines)
LOAD_LDG, ///< Cache as texture
LOAD_VOLATILE, ///< Volatile (any memory space)
};
/**
* \name Simple I/O
* @{
*/
/**
* \brief Thread utility for reading memory using cub::PtxLoadModifier cache modifiers.
*
* Cache modifiers will only be applied to built-in types (i.e., C++
* primitives and CUDA vector-types).
*
* For example:
* \par
* \code
* #include <cub/cub.cuh>
*
* // 32-bit load using cache-global modifier:
* int *d_in;
* int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
*
* // 16-bit load using default modifier
* short *d_in;
* short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
*
* // 256-bit load using cache-volatile modifier
* double4 *d_in;
* double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
*
* // 96-bit load using default cache modifier (ignoring LOAD_CS)
* struct TestFoo { bool a; short b; };
* TestFoo *d_struct;
* TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
* \endcode
*
*/
template <
PtxLoadModifier MODIFIER,
typename InputIteratorRA>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr);
//@} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Define a int4 (16B) ThreadLoad specialization for the given PTX load modifier
*/
#define CUB_LOAD_16(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ int4 ThreadLoad<cub_modifier, int4*>(int4* ptr) \
{ \
int4 retval; \
asm volatile ("ld."#ptx_modifier".v4.s32 {%0, %1, %2, %3}, [%4];" : \
"=r"(retval.x), \
"=r"(retval.y), \
"=r"(retval.z), \
"=r"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ longlong2 ThreadLoad<cub_modifier, longlong2*>(longlong2* ptr) \
{ \
longlong2 retval; \
asm volatile ("ld."#ptx_modifier".v2.s64 {%0, %1}, [%2];" : \
"=l"(retval.x), \
"=l"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a int2 (8B) ThreadLoad specialization for the given PTX load modifier
*/
#define CUB_LOAD_8(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ short4 ThreadLoad<cub_modifier, short4*>(short4* ptr) \
{ \
short4 retval; \
asm volatile ("ld."#ptx_modifier".v4.s16 {%0, %1, %2, %3}, [%4];" : \
"=h"(retval.x), \
"=h"(retval.y), \
"=h"(retval.z), \
"=h"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ int2 ThreadLoad<cub_modifier, int2*>(int2* ptr) \
{ \
int2 retval; \
asm volatile ("ld."#ptx_modifier".v2.s32 {%0, %1}, [%2];" : \
"=r"(retval.x), \
"=r"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ long long ThreadLoad<cub_modifier, long long*>(long long* ptr) \
{ \
long long retval; \
asm volatile ("ld."#ptx_modifier".s64 %0, [%1];" : \
"=l"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a int (4B) ThreadLoad specialization for the given PTX load modifier
*/
#define CUB_LOAD_4(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ int ThreadLoad<cub_modifier, int*>(int* ptr) \
{ \
int retval; \
asm volatile ("ld."#ptx_modifier".s32 %0, [%1];" : \
"=r"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a short (2B) ThreadLoad specialization for the given PTX load modifier
*/
#define CUB_LOAD_2(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ short ThreadLoad<cub_modifier, short*>(short* ptr) \
{ \
short retval; \
asm volatile ("ld."#ptx_modifier".s16 %0, [%1];" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a char (1B) ThreadLoad specialization for the given PTX load modifier
*/
#define CUB_LOAD_1(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ char ThreadLoad<cub_modifier, char*>(char* ptr) \
{ \
short retval; \
asm volatile ( \
"{" \
" .reg .s8 datum;" \
" ld."#ptx_modifier".s8 datum, [%1];" \
" cvt.s16.s8 %0, datum;" \
"}" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return (char) retval; \
}
/**
* Define powers-of-two ThreadLoad specializations for the given PTX load modifier
*/
#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
CUB_LOAD_16(cub_modifier, ptx_modifier) \
CUB_LOAD_8(cub_modifier, ptx_modifier) \
CUB_LOAD_4(cub_modifier, ptx_modifier) \
CUB_LOAD_2(cub_modifier, ptx_modifier) \
CUB_LOAD_1(cub_modifier, ptx_modifier) \
/**
* Define ThreadLoad specializations for the various PTX load modifiers
*/
#if CUB_PTX_ARCH >= 200
CUB_LOAD_ALL(LOAD_CA, ca)
CUB_LOAD_ALL(LOAD_CG, cg)
CUB_LOAD_ALL(LOAD_CS, cs)
CUB_LOAD_ALL(LOAD_CV, cv)
#else
// LOAD_CV on SM10-13 uses "volatile.global" to ensure reads from last level
CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif
#if CUB_PTX_ARCH >= 350
CUB_LOAD_ALL(LOAD_LDG, global.nc)
#endif
/// Helper structure for templated load iteration (inductive case)
template <PtxLoadModifier MODIFIER, int COUNT, int MAX>
struct IterateThreadLoad
{
template <typename T>
static __device__ __forceinline__ void Load(T *ptr, T *vals)
{
vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
IterateThreadLoad<MODIFIER, COUNT + 1, MAX>::Load(ptr, vals);
}
};
/// Helper structure for templated load iteration (termination case)
template <PtxLoadModifier MODIFIER, int MAX>
struct IterateThreadLoad<MODIFIER, MAX, MAX>
{
template <typename T>
static __device__ __forceinline__ void Load(T *ptr, T *vals) {}
};
/**
* Load with LOAD_DEFAULT on iterator types
*/
template <typename InputIteratorRA>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(
InputIteratorRA itr,
Int2Type<LOAD_DEFAULT> modifier,
Int2Type<false> is_pointer)
{
return *itr;
}
/**
* Load with LOAD_DEFAULT on pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<LOAD_DEFAULT> modifier,
Int2Type<true> is_pointer)
{
return *ptr;
}
/**
* Load with LOAD_VOLATILE on primitive pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatile(
T *ptr,
Int2Type<true> is_primitive)
{
T retval = *reinterpret_cast<volatile T*>(ptr);
#if (CUB_PTX_ARCH <= 130)
if (sizeof(T) == 1) __threadfence_block();
#endif
return retval;
}
/**
* Load with LOAD_VOLATILE on non-primitive pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatile(
T *ptr,
Int2Type<false> is_primitive)
{
typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying
enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
// Memcopy from aliased source into array of uninitialized words
typename WordAlignment<T>::UninitializedVolatileWords words;
#pragma unroll
for (int i = 0; i < NUM_WORDS; ++i)
words.buf[i] = reinterpret_cast<volatile VolatileWord*>(ptr)[i];
// Load from words
return *reinterpret_cast<T*>(words.buf);
}
/**
* Load with LOAD_VOLATILE on pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<LOAD_VOLATILE> modifier,
Int2Type<true> is_pointer)
{
return ThreadLoadVolatile(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}
#if (CUB_PTX_ARCH <= 130)
/**
* Load with LOAD_CG uses LOAD_CV in pre-SM20 PTX to ensure coherent reads when run on newer architectures with L1
*/
template <typename T>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<LOAD_CG> modifier,
Int2Type<true> is_pointer)
{
return ThreadLoad<LOAD_CV>(ptr);
}
#endif // (CUB_PTX_ARCH <= 130)
/**
* Load with arbitrary MODIFIER on pointer types
*/
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<MODIFIER> modifier,
Int2Type<true> is_pointer)
{
typedef typename WordAlignment<T>::DeviceWord DeviceWord;
enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
// Memcopy from aliased source into array of uninitialized words
typename WordAlignment<T>::UninitializedDeviceWords words;
IterateThreadLoad<PtxLoadModifier(MODIFIER), 0, NUM_WORDS>::Load(
reinterpret_cast<DeviceWord*>(ptr),
words.buf);
// Load from words
return *reinterpret_cast<T*>(words.buf);
}
/**
* Generic ThreadLoad definition
*/
template <
PtxLoadModifier MODIFIER,
typename InputIteratorRA>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr)
{
return ThreadLoad(
itr,
Int2Type<MODIFIER>(),
Int2Type<IsPointer<InputIteratorRA>::VALUE>());
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group IoModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,145 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Simple binary operator functor types
*/
/******************************************************************************
* Simple functor operators
******************************************************************************/
#pragma once
#include "../util_macro.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup ThreadModule
* @{
*/
/**
* \brief Default equality functor
*/
struct Equality
{
/// Boolean equality operator, returns <tt>(a == b)</tt>
template <typename T>
__host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
{
return a == b;
}
};
/**
* \brief Default inequality functor
*/
struct Inequality
{
/// Boolean inequality operator, returns <tt>(a != b)</tt>
template <typename T>
__host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
{
return a != b;
}
};
/**
* \brief Default sum functor
*/
struct Sum
{
/// Binary sum operator, returns <tt>a + b</tt>
template <typename T>
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
{
return a + b;
}
};
/**
* \brief Default max functor
*/
struct Max
{
/// Binary max operator, returns <tt>(a > b) ? a : b</tt>
template <typename T>
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
{
return CUB_MAX(a, b);
}
};
/**
* \brief Default min functor
*/
struct Min
{
/// Binary min operator, returns <tt>(a < b) ? a : b</tt>
template <typename T>
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
{
return CUB_MIN(a, b);
}
};
/**
* \brief Default cast functor
*/
template <typename B>
struct Cast
{
/// Cast operator, returns <tt>(B) a</tt>
template <typename A>
__host__ __device__ __forceinline__ B operator()(const A &a)
{
return (B) a;
}
};
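/**
 * These functors are ordinary callable objects usable from both host and
 * device code; a minimal sketch of direct use (values are illustrative):
 *
 * \code
 * cub::Sum      sum_op;
 * cub::Max      max_op;
 * cub::Equality eq_op;
 *
 * int a = 7, b = 3;
 * int  s = sum_op(a, b);   // 10
 * int  m = max_op(a, b);   // 7
 * bool e = eq_op(a, b);    // false
 * \endcode
 */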
/** @} */ // end group ThreadModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,145 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for sequential reduction over statically-sized array types
*/
#pragma once
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup ThreadModule
* @{
*/
/**
* \name Sequential reduction over statically-sized array types
* @{
*/
/**
* \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH Length of input array
* \tparam T <b>[inferred]</b> The data type to be reduced.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ReductionOp>
__device__ __forceinline__ T ThreadReduce(
T* input, ///< [in] Input array
ReductionOp reduction_op, ///< [in] Binary reduction operator
T prefix) ///< [in] Prefix to seed reduction with
{
#pragma unroll
for (int i = 0; i < LENGTH; ++i)
{
prefix = reduction_op(prefix, input[i]);
}
return prefix;
}
/**
* \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned.
*
* \tparam LENGTH Length of input array
* \tparam T <b>[inferred]</b> The data type to be reduced.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ReductionOp>
__device__ __forceinline__ T ThreadReduce(
T* input, ///< [in] Input array
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
T prefix = input[0];
return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
}
/**
* \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input array
* \tparam T <b>[inferred]</b> The data type to be reduced.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ReductionOp>
__device__ __forceinline__ T ThreadReduce(
T (&input)[LENGTH], ///< [in] Input array
ReductionOp reduction_op, ///< [in] Binary reduction operator
T prefix) ///< [in] Prefix to seed reduction with
{
return ThreadReduce<LENGTH>((T*) input, reduction_op, prefix);
}
/**
* \brief Serial reduction with the specified operator
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input array
* \tparam T <b>[inferred]</b> The data type to be reduced.
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ReductionOp>
__device__ __forceinline__ T ThreadReduce(
T (&input)[LENGTH], ///< [in] Input array
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return ThreadReduce<LENGTH>((T*) input, reduction_op);
}
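/**
 * A minimal sketch of a per-thread reduction over a register-resident array
 * (the kernel context and values are illustrative):
 *
 * \code
 * // Inside a kernel:
 * int items[4]   = {1, 2, 3, 4};
 * int sum        = cub::ThreadReduce(items, cub::Sum());        // 10
 * int sum_seeded = cub::ThreadReduce(items, cub::Sum(), 100);   // 110
 * \endcode
 */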
//@} end member group
/** @} */ // end group ThreadModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,231 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for sequential prefix scan over statically-sized array types
*/
#pragma once
#include "../thread/thread_operators.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup ThreadModule
* @{
*/
/**
* \name Sequential prefix scan over statically-sized array types
* @{
*/
/**
* \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanExclusive(
T *input, ///< [in] Input array
T *output, ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T prefix, ///< [in] Prefix to seed scan with
bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.)
{
T inclusive = input[0];
if (apply_prefix)
{
inclusive = scan_op(prefix, inclusive);
}
output[0] = prefix;
T exclusive = inclusive;
#pragma unroll
for (int i = 1; i < LENGTH; ++i)
{
inclusive = scan_op(exclusive, input[i]);
output[i] = exclusive;
exclusive = inclusive;
}
return inclusive;
}
/**
* \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanExclusive(
T (&input)[LENGTH], ///< [in] Input array
T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T prefix, ///< [in] Prefix to seed scan with
bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
{
return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
}
/**
* \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned.
*
* \tparam LENGTH Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
T *input, ///< [in] Input array
T *output, ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op) ///< [in] Binary scan operator
{
T inclusive = input[0];
output[0] = inclusive;
// Continue scan
#pragma unroll
for (int i = 1; i < LENGTH; ++i)
{
inclusive = scan_op(inclusive, input[i]);
output[i] = inclusive;
}
return inclusive;
}
/**
* \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned.
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
T (&input)[LENGTH], ///< [in] Input array
T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op) ///< [in] Binary scan operator
{
return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
}
/**
* \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
T *input, ///< [in] Input array
T *output, ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T prefix, ///< [in] Prefix to seed scan with
bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
{
T inclusive = input[0];
if (apply_prefix)
{
inclusive = scan_op(prefix, inclusive);
}
output[0] = inclusive;
// Continue scan
#pragma unroll
for (int i = 1; i < LENGTH; ++i)
{
inclusive = scan_op(inclusive, input[i]);
output[i] = inclusive;
}
return inclusive;
}
/**
* \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
*
* \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays
* \tparam T <b>[inferred]</b> The data type to be scanned.
* \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
int LENGTH,
typename T,
typename ScanOp>
__device__ __forceinline__ T ThreadScanInclusive(
T (&input)[LENGTH], ///< [in] Input array
T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input)
ScanOp scan_op, ///< [in] Binary scan operator
T prefix, ///< [in] Prefix to seed scan with
bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
{
return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
}
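/**
 * A minimal sketch contrasting the exclusive and inclusive variants with
 * cub::Sum over a four-element array (the kernel context and values are
 * illustrative):
 *
 * \code
 * // Inside a kernel:
 * int input[4] = {1, 2, 3, 4};
 * int excl[4], incl[4];
 * int agg_ex = cub::ThreadScanExclusive(input, excl, cub::Sum(), 0);
 * // excl = {0, 1, 3, 6},  agg_ex = 10
 * int agg_in = cub::ThreadScanInclusive(input, incl, cub::Sum());
 * // incl = {1, 3, 6, 10}, agg_in = 10
 * \endcode
 */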
//@} end member group
/** @} */ // end group ThreadModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,412 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for writing memory using PTX cache modifiers.
*/
#pragma once
#include <cuda.h>
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup IoModule
* @{
*/
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
* \brief Enumeration of PTX cache-modifiers for memory store operations.
*/
enum PtxStoreModifier
{
STORE_DEFAULT, ///< Default (no modifier)
STORE_WB, ///< Cache write-back all coherent levels
STORE_CG, ///< Cache at global level
STORE_CS, ///< Cache streaming (likely to be accessed once)
STORE_WT, ///< Cache write-through (to system memory)
STORE_VOLATILE, ///< Volatile shared (any memory space)
};
/**
* \name Simple I/O
* @{
*/
/**
* \brief Thread utility for writing memory using cub::PtxStoreModifier cache modifiers.
*
* Cache modifiers will only be applied to built-in types (i.e., C++
* primitives and CUDA vector-types).
*
* For example:
* \par
* \code
* #include <cub/cub.cuh>
*
* // 32-bit store using cache-global modifier:
* int *d_out;
* int val;
* cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
*
* // 16-bit store using default modifier
* short *d_out;
* short val;
* cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
*
* // 256-bit store using write-through modifier
* double4 *d_out;
* double4 val;
* cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
*
* // 96-bit store using default cache modifier (ignoring STORE_CS)
* struct TestFoo { bool a; short b; };
* TestFoo *d_struct;
* TestFoo val;
* cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
* \endcode
*
*/
template <
PtxStoreModifier MODIFIER,
typename OutputIteratorRA,
typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val);
//@} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Define a int4 (16B) ThreadStore specialization for the given PTX load modifier
*/
#define CUB_STORE_16(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, int4*, int4>(int4* ptr, int4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.s32 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y), \
"r"(val.z), \
"r"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, longlong2*, longlong2>(longlong2* ptr, longlong2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.s64 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val.x), \
"l"(val.y)); \
}
/**
* Define a int2 (8B) ThreadStore specialization for the given PTX load modifier
*/
#define CUB_STORE_8(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, short4*, short4>(short4* ptr, short4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.s16 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val.x), \
"h"(val.y), \
"h"(val.z), \
"h"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, int2*, int2>(int2* ptr, int2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.s32 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, long long*, long long>(long long* ptr, long long val) \
{ \
asm volatile ("st."#ptx_modifier".s64 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val)); \
}
/**
* Define a int (4B) ThreadStore specialization for the given PTX load modifier
*/
#define CUB_STORE_4(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, int*, int>(int* ptr, int val) \
{ \
asm volatile ("st."#ptx_modifier".s32 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val)); \
}
/**
* Define a short (2B) ThreadStore specialization for the given PTX load modifier
*/
#define CUB_STORE_2(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, short*, short>(short* ptr, short val) \
{ \
asm volatile ("st."#ptx_modifier".s16 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val)); \
}
/**
* Define a char (1B) ThreadStore specialization for the given PTX load modifier
*/
#define CUB_STORE_1(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, char*, char>(char* ptr, char val) \
{ \
asm volatile ( \
"{" \
" .reg .s8 datum;" \
" cvt.s8.s16 datum, %1;" \
" st."#ptx_modifier".s8 [%0], datum;" \
"}" : : \
_CUB_ASM_PTR_(ptr), \
"h"(short(val))); \
}
/**
* Define powers-of-two ThreadStore specializations for the given PTX load modifier
*/
#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \
CUB_STORE_16(cub_modifier, ptx_modifier) \
CUB_STORE_8(cub_modifier, ptx_modifier) \
CUB_STORE_4(cub_modifier, ptx_modifier) \
CUB_STORE_2(cub_modifier, ptx_modifier) \
CUB_STORE_1(cub_modifier, ptx_modifier) \
/**
* Define ThreadStore specializations for the various PTX load modifiers
*/
#if CUB_PTX_ARCH >= 200
CUB_STORE_ALL(STORE_WB, ca)
CUB_STORE_ALL(STORE_CG, cg)
CUB_STORE_ALL(STORE_CS, cs)
CUB_STORE_ALL(STORE_WT, cv)
#else
// STORE_WT on SM10-13 uses "volatile.global" to ensure writes to last level
CUB_STORE_ALL(STORE_WT, volatile.global)
#endif
/// Helper structure for templated store iteration (inductive case)
template <PtxStoreModifier MODIFIER, int COUNT, int MAX>
struct IterateThreadStore
{
template <typename T>
static __device__ __forceinline__ void Store(T *ptr, T *vals)
{
ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
IterateThreadStore<MODIFIER, COUNT + 1, MAX>::Store(ptr, vals);
}
};
/// Helper structure for templated store iteration (termination case)
template <PtxStoreModifier MODIFIER, int MAX>
struct IterateThreadStore<MODIFIER, MAX, MAX>
{
template <typename T>
static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
};
/**
* Store with STORE_DEFAULT on iterator types
*/
template <typename OutputIteratorRA, typename T>
__device__ __forceinline__ void ThreadStore(
OutputIteratorRA itr,
T val,
Int2Type<STORE_DEFAULT> modifier,
Int2Type<false> is_pointer)
{
*itr = val;
}
/**
* Store with STORE_DEFAULT on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_DEFAULT> modifier,
Int2Type<true> is_pointer)
{
*ptr = val;
}
/**
* Store with STORE_VOLATILE on primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatile(
T *ptr,
T val,
Int2Type<true> is_primitive)
{
*reinterpret_cast<volatile T*>(ptr) = val;
}
/**
* Store with STORE_VOLATILE on non-primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatile(
T *ptr,
T val,
Int2Type<false> is_primitive)
{
typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying
enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) };
// Store into array of uninitialized words
typename WordAlignment<T>::UninitializedVolatileWords words;
*reinterpret_cast<T*>(words.buf) = val;
// Memcopy words to aliased destination
#pragma unroll
for (int i = 0; i < NUM_WORDS; ++i)
reinterpret_cast<volatile VolatileWord*>(ptr)[i] = words.buf[i];
}
/**
* Store with STORE_VOLATILE on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_VOLATILE> modifier,
Int2Type<true> is_pointer)
{
ThreadStoreVolatile(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}
#if (CUB_PTX_ARCH <= 350)
/**
* Store with STORE_CG on pointer types (uses STORE_DEFAULT on current architectures)
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_CG> modifier,
Int2Type<true> is_pointer)
{
ThreadStore<STORE_DEFAULT>(ptr, val);
}
#endif // (CUB_PTX_ARCH <= 350)
/**
* Store with arbitrary MODIFIER on pointer types
*/
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<MODIFIER> modifier,
Int2Type<true> is_pointer)
{
typedef typename WordAlignment<T>::DeviceWord DeviceWord; // Word type for memcopying
enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) };
// Store into array of uninitialized words
typename WordAlignment<T>::UninitializedDeviceWords words;
*reinterpret_cast<T*>(words.buf) = val;
// Memcopy words to aliased destination
IterateThreadStore<PtxStoreModifier(MODIFIER), 0, NUM_WORDS>::Store(
reinterpret_cast<DeviceWord*>(ptr),
words.buf);
}
/**
* Generic ThreadStore definition
*/
template <PtxStoreModifier MODIFIER, typename OutputIteratorRA, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val)
{
ThreadStore(
itr,
val,
Int2Type<MODIFIER>(),
Int2Type<IsPointer<OutputIteratorRA>::VALUE>());
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group IoModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,661 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple caching allocator for device memory allocations. The allocator is
* thread-safe and capable of managing device allocations on multiple devices.
******************************************************************************/
#pragma once
#ifndef __CUDA_ARCH__
#include <set> // NVCC (EDG, really) takes FOREVER to compile std::map
#include <map>
#endif
#include <math.h>
#include "util_namespace.cuh"
#include "util_debug.cuh"
#include "host/spinlock.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* CachingDeviceAllocator (host use)
******************************************************************************/
/**
* \brief A simple caching allocator for device memory allocations.
*
* \par Overview
* The allocator is thread-safe and is capable of managing cached device allocations
* on multiple devices. It behaves as follows:
*
* \par
* - Allocations categorized by bin size.
* - Bin sizes progress geometrically in accordance with the growth factor
* \p bin_growth provided during construction. Unused device allocations within
* a larger bin cache are not reused for allocation requests that categorize to
* smaller bin sizes.
* - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
* (\p bin_growth ^ \p min_bin).
* - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
* bin and are simply freed when they are deallocated instead of being returned
* to a bin-cache.
* - %If the total storage of cached allocations on a given device will exceed
* \p max_cached_bytes, allocations for that device are simply freed when they are
* deallocated instead of being returned to their bin-cache.
*
* \par
* For example, the default-constructed CachingDeviceAllocator is configured with:
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = 6MB - 1B
*
* \par
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
* and sets a maximum of 6,291,455 cached bytes per device
*
*/
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
//---------------------------------------------------------------------
// Type definitions and constants
//---------------------------------------------------------------------
enum
{
/// Invalid device ordinal
INVALID_DEVICE_ORDINAL = -1,
};
/**
* Integer pow function for unsigned base and exponent
*/
static unsigned int IntPow(
unsigned int base,
unsigned int exp)
{
unsigned int retval = 1;
while (exp > 0)
{
if (exp & 1) {
retval = retval * base; // multiply the result by the current base
}
base = base * base; // square the base
exp = exp >> 1; // divide the exponent in half
}
return retval;
}
/**
* Round up to the nearest power of the given base
*/
static void NearestPowerOf(
unsigned int &power,
size_t &rounded_bytes,
unsigned int base,
size_t value)
{
power = 0;
rounded_bytes = 1;
while (rounded_bytes < value)
{
rounded_bytes *= base;
power++;
}
}
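/*
* Editorial note (not part of the original CUB source): a worked example of
* the rounding above, assuming the default growth factor of 8. For a
* 10,000-byte request, rounded_bytes progresses 1 -> 8 -> 64 -> 512 ->
* 4,096 -> 32,768; the loop stops at 32,768 >= 10,000, so power = 5 and the
* request is served from the 32 KB bin.
*/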
/**
* Descriptor for device memory allocations
*/
struct BlockDescriptor
{
int device; // device ordinal
void* d_ptr; // Device pointer
size_t bytes; // Size of allocation in bytes
unsigned int bin; // Bin enumeration
// Constructor
BlockDescriptor(void *d_ptr, int device) :
d_ptr(d_ptr),
bytes(0),
bin(0),
device(device) {}
// Constructor
BlockDescriptor(size_t bytes, unsigned int bin, int device) :
d_ptr(NULL),
bytes(bytes),
bin(bin),
device(device) {}
// Comparison functor for comparing device pointers
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.d_ptr < b.d_ptr);
}
}
// Comparison functor for comparing allocation sizes
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device < b.device) {
return true;
} else if (a.device > b.device) {
return false;
} else {
return (a.bytes < b.bytes);
}
}
};
/// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
/// Set type for cached blocks (ordered by size)
typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
/// Set type for live blocks (ordered by ptr)
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
/// Map type of device ordinals to the number of cached bytes cached by each device
typedef std::map<int, size_t> GpuCachedBytes;
#endif // __CUDA_ARCH__
//---------------------------------------------------------------------
// Fields
//---------------------------------------------------------------------
Spinlock spin_lock; /// Spinlock for thread-safety
unsigned int bin_growth; /// Geometric growth factor for bin-sizes
unsigned int min_bin; /// Minimum bin enumeration
unsigned int max_bin; /// Maximum bin enumeration
size_t min_bin_bytes; /// Minimum bin size
size_t max_bin_bytes; /// Maximum bin size
size_t max_cached_bytes; /// Maximum aggregate cached bytes per device
bool debug; /// Whether or not to print (de)allocation events to stdout
bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators)
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device
CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse
BusyBlocks live_blocks; /// Set of live device allocations currently in use
#endif // __CUDA_ARCH__
#endif // DOXYGEN_SHOULD_SKIP_THIS
//---------------------------------------------------------------------
// Methods
//---------------------------------------------------------------------
/**
* \brief Constructor.
*/
CachingDeviceAllocator(
unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
unsigned int min_bin, ///< Minimum bin
unsigned int max_bin, ///< Maximum bin
size_t max_cached_bytes) ///< Maximum aggregate cached bytes per device
:
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
skip_cleanup(false),    // always run FreeAllCached() from the destructor
debug(false),
spin_lock(0),
bin_growth(bin_growth),
min_bin(min_bin),
max_bin(max_bin),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes(max_cached_bytes)
{}
/**
* \brief Default constructor.
*
* Configured with:
* \par
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
*
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
* sets a maximum of 6,291,455 cached bytes per device
*/
CachingDeviceAllocator(bool skip_cleanup = false) :
#ifndef __CUDA_ARCH__ // Only define STL container members in host code
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare),
#endif
skip_cleanup(skip_cleanup),
debug(false),
spin_lock(0),
bin_growth(8),
min_bin(3),
max_bin(7),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes((max_bin_bytes * 3) - 1)
{}
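/*
* Editorial usage sketch (not part of the original CUB source): a
* program-lifetime allocator can be declared at namespace scope with
* \p skip_cleanup = true so that its destructor does not call
* FreeAllCached() after the CUDA runtime may already have been torn down:
*
*     // Global caching allocator; cached blocks are intentionally left to
*     // the OS at program exit instead of being freed after runtime shutdown.
*     cub::CachingDeviceAllocator g_allocator(true);
*/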
/**
* \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
*/
cudaError_t SetMaxCachedBytes(
size_t max_cached_bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
// Lock
Lock(&spin_lock);
this->max_cached_bytes = max_cached_bytes;
if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
// Unlock
Unlock(&spin_lock);
return cudaSuccess;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the specified device
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}
// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}
BlockDescriptor search_key(bin_bytes, bin, device);
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
do {
// Find a free block big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
if ((block_itr != cached_blocks.end()) &&
(block_itr->device == device) &&
(block_itr->bin == search_key.bin))
{
// Reuse existing cache block. Insert into live blocks.
search_key = *block_itr;
live_blocks.insert(search_key);
// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device] -= search_key.bytes;
if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Need to allocate a new cache block. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
// Insert into live blocks
live_blocks.insert(search_key);
if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
} while(0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Copy device pointer to output parameter (NULL on error)
*d_ptr = search_key.d_ptr;
// Attempt to revert to the previous device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the current device
*/
cudaError_t DeviceAllocate(
void** d_ptr,
size_t bytes)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do {
int current_device;
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
*/
cudaError_t DeviceFree(
void* d_ptr,
int device)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
BlockDescriptor search_key(d_ptr, device);
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
do {
// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr == live_blocks.end())
{
// Cannot find pointer
if (CubDebug(error = cudaErrorUnknown)) break;
}
else
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
// Check if we should keep the returned allocation
if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device] += search_key.bytes;
if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
else
{
// Free the returned allocation. Unlock.
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Set to specified device
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (CubDebug(error = cudaSetDevice(device))) break;
// Free device memory
if (CubDebug(error = cudaFree(d_ptr))) break;
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
}
}
} while (0);
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees a live allocation of device memory on the current device, returning it to the allocator
*/
cudaError_t DeviceFree(
void* d_ptr)
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
int current_device;
cudaError_t error = cudaSuccess;
do {
if (CubDebug(error = cudaGetDevice(&current_device))) break;
if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
} while(0);
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Frees all cached device allocations on all devices
*/
cudaError_t FreeAllCached()
{
#ifdef __CUDA_ARCH__
// Caching functionality only defined on host
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
bool locked = false;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
int current_device = INVALID_DEVICE_ORDINAL;
// Lock
if (!locked) {
Lock(&spin_lock);
locked = true;
}
while (!cached_blocks.empty())
{
// Get first block
CachedBlocks::iterator begin = cached_blocks.begin();
// Get entry-point device ordinal if necessary
if (entrypoint_device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
}
// Set current device ordinal if necessary
if (begin->device != current_device)
{
if (CubDebug(error = cudaSetDevice(begin->device))) break;
current_device = begin->device;
}
// Free device memory
if (CubDebug(error = cudaFree(begin->d_ptr))) break;
// Reduce balance and erase entry
cached_bytes[current_device] -= begin->bytes;
cached_blocks.erase(begin);
if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
}
// Unlock
if (locked) {
Unlock(&spin_lock);
locked = false;
}
// Attempt to revert to the entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
#endif // __CUDA_ARCH__
}
/**
* \brief Destructor
*/
virtual ~CachingDeviceAllocator()
{
if (!skip_cleanup)
FreeAllCached();
}
};
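/*
* Editorial usage sketch (not part of the original CUB source), assuming a
* host translation unit that includes this header and the CubDebugExit macro
* from the debug utilities:
*
*     // Growth factor 8, bins 8^3..8^7 (512B..2MB), at most ~6MB cached per device
*     cub::CachingDeviceAllocator allocator(8, 3, 7, 6 * 1024 * 1024 - 1);
*
*     void Example(int num_items)
*     {
*         void *d_buf = NULL;
*
*         // Rounds the request up to the enclosing bin and either reuses a
*         // cached block or calls cudaMalloc on the current device.
*         CubDebugExit(allocator.DeviceAllocate(&d_buf, num_items * sizeof(int)));
*
*         // ... launch kernels that use d_buf ...
*
*         // Returns the block to the per-device cache (or frees it outright
*         // if the cache would exceed max_cached_bytes).
*         CubDebugExit(allocator.DeviceFree(d_buf));
*     }
*/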
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,295 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Static architectural properties by SM version.
*/
/******************************************************************************
* Static architectural properties by SM version.
*
* "Device" reflects the PTX architecture targeted by the active compiler
* pass. It provides useful compile-time statics within device code. E.g.,:
*
* __shared__ int[Device::WARP_THREADS];
*
* int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS);
*
******************************************************************************/
#pragma once
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
#ifndef __CUDA_ARCH__
#define CUB_PTX_ARCH 0
#else
#define CUB_PTX_ARCH __CUDA_ARCH__
#endif
/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
#if !defined(__CUDA_ARCH__) || defined(CUB_CDP)
#define CUB_RUNTIME_ENABLED
#endif
/// Execution space for destructors
#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200))
#define CUB_DESTRUCTOR __host__
#else
#define CUB_DESTRUCTOR __host__ __device__
#endif
/**
* \brief Structure for statically reporting CUDA device properties, parameterized by SM architecture.
*
* The default specialization is for SM10.
*/
template <int SM_ARCH>
struct ArchProps
{
enum
{
LOG_WARP_THREADS =
5, /// Log of the number of threads per warp
WARP_THREADS =
1 << LOG_WARP_THREADS, /// Number of threads per warp
LOG_SMEM_BANKS =
4, /// Log of the number of smem banks
SMEM_BANKS =
1 << LOG_SMEM_BANKS, /// The number of smem banks
SMEM_BANK_BYTES =
4, /// Size of smem bank words
SMEM_BYTES =
16 * 1024, /// Maximum SM shared memory
SMEM_ALLOC_UNIT =
512, /// Smem allocation size in bytes
REGS_BY_BLOCK =
true, /// Whether or not the architecture allocates registers by block (or by warp)
REG_ALLOC_UNIT =
256, /// Number of registers allocated at a time per block (or by warp)
WARP_ALLOC_UNIT =
2, /// Granularity of warps for which registers are allocated
MAX_SM_THREADS =
768, /// Maximum number of threads per SM
MAX_SM_THREADBLOCKS =
8, /// Maximum number of thread blocks per SM
MAX_BLOCK_THREADS =
512, /// Maximum number of threads per thread block
MAX_SM_REGISTERS =
8 * 1024, /// Maximum number of registers per SM
};
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Architecture properties for SM30
*/
template <>
struct ArchProps<300>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 5, // 32 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 48 * 1024, // 48KB shared memory
SMEM_ALLOC_UNIT = 256, // 256B smem allocation segment size
REGS_BY_BLOCK = false, // Allocates registers by warp
REG_ALLOC_UNIT = 256, // 256 registers allocated at a time per warp
WARP_ALLOC_UNIT = 4, // Registers are allocated at a granularity of every 4 warps per threadblock
MAX_SM_THREADS = 2048, // 2K max threads per SM
MAX_SM_THREADBLOCKS = 16, // 16 max threadblocks per SM
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
MAX_SM_REGISTERS = 64 * 1024, // 64K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
target.template Callback<ArchProps>();
}
};
/**
* Architecture properties for SM20
*/
template <>
struct ArchProps<200>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 5, // 32 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 48 * 1024, // 48KB shared memory
SMEM_ALLOC_UNIT = 128, // 128B smem allocation segment size
REGS_BY_BLOCK = false, // Allocates registers by warp
REG_ALLOC_UNIT = 64, // 64 registers allocated at a time per warp
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
MAX_SM_THREADS = 1536, // 1536 max threads per SM
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock
MAX_SM_REGISTERS = 32 * 1024, // 32K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 200) {
ArchProps<300>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM12
*/
template <>
struct ArchProps<120>
{
enum
{
LOG_WARP_THREADS = 5, // 32 threads per warp
WARP_THREADS = 1 << LOG_WARP_THREADS,
LOG_SMEM_BANKS = 4, // 16 banks
SMEM_BANKS = 1 << LOG_SMEM_BANKS,
SMEM_BANK_BYTES = 4, // 4 byte bank words
SMEM_BYTES = 16 * 1024, // 16KB shared memory
SMEM_ALLOC_UNIT = 512, // 512B smem allocation segment size
REGS_BY_BLOCK = true, // Allocates registers by threadblock
REG_ALLOC_UNIT = 512, // 512 registers allocated at a time per threadblock
WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock
MAX_SM_THREADS = 1024, // 1024 max threads per SM
MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM
MAX_BLOCK_THREADS = 512, // 512 max threads per threadblock
MAX_SM_REGISTERS = 16 * 1024, // 16K max registers per SM
};
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 120) {
ArchProps<200>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM10. Derives from the default ArchProps specialization.
*/
template <>
struct ArchProps<100> : ArchProps<0>
{
// Callback utility
template <typename T>
static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version)
{
if (sm_version > 100) {
ArchProps<120>::Callback(target, sm_version);
} else {
target.template Callback<ArchProps>();
}
}
};
/**
* Architecture properties for SM35
*/
template <>
struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30
/**
* Architecture properties for SM21
*/
template <>
struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20
/**
* Architecture properties for SM13
*/
template <>
struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12
/**
* Architecture properties for SM11
*/
template <>
struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief The architectural properties for the PTX version targeted by the active compiler pass.
*/
struct PtxArchProps : ArchProps<CUB_PTX_ARCH> {};
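/*
* Editorial usage sketch (not part of the original CUB source): PtxArchProps
* provides compile-time constants for the PTX target of the current compiler
* pass, e.g. for statically sizing shared memory inside a kernel:
*
*     __global__ void ExampleKernel(int *d_out)
*     {
*         // Sized at compile time from the targeted architecture
*         __shared__ int smem[cub::PtxArchProps::WARP_THREADS];
*
*         if (threadIdx.x < cub::PtxArchProps::WARP_THREADS)
*             smem[threadIdx.x] = threadIdx.x;
*         __syncthreads();
*
*         if (threadIdx.x == 0)
*             *d_out = smem[cub::PtxArchProps::WARP_THREADS - 1];
*     }
*/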
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,115 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Error and event logging routines.
*
* The following macro definitions are supported:
* - \p CUB_LOG. Simple event messages are printed to \p stdout.
*/
#pragma once
#include <stdio.h>
#include "util_namespace.cuh"
#include "util_arch.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/// CUB error reporting macro (prints error messages to stderr)
#if (defined(DEBUG) || defined(_DEBUG))
#define CUB_STDERR
#endif
/**
* \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
*
* \return The CUDA error.
*/
__host__ __device__ __forceinline__ cudaError_t Debug(
cudaError_t error,
const char* filename,
int line)
{
#ifdef CUB_STDERR
if (error)
{
#if (CUB_PTX_ARCH == 0)
fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
fflush(stderr);
#elif (CUB_PTX_ARCH >= 200)
printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
#endif
}
#endif
return error;
}
/**
* \brief Debug macro
*/
#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
/**
* \brief Debug macro with exit
*/
#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
/**
* \brief Log macro for printf statements.
*/
#if (CUB_PTX_ARCH == 0)
#define CubLog(format, ...) printf(format,__VA_ARGS__);
#elif (CUB_PTX_ARCH >= 200)
#define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
#endif
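/*
* Editorial usage sketch (not part of the original CUB source): wrapping CUDA
* runtime calls with CubDebug passes the error code through unchanged and,
* when CUB_STDERR is defined, reports the failing file and line:
*
*     cudaError_t CopyExample(int *d_dst, const int *h_src, size_t n)
*     {
*         cudaError_t error;
*         if (CubDebug(error = cudaMemcpy(d_dst, h_src, n * sizeof(int),
*                                         cudaMemcpyHostToDevice)))
*             return error;
*
*         CubLog("copied %lld bytes\n", (long long) (n * sizeof(int)));
*         return cudaSuccess;
*     }
*/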
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,378 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Properties of a given CUDA device and the corresponding PTX bundle
*/
#pragma once
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
*/
template <typename T>
__global__ void EmptyKernel(void) { }
/**
* Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
*/
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation
void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed
size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed
{
const int ALIGN_BYTES = 256;
const int ALIGN_MASK = ~(ALIGN_BYTES - 1);
// Compute exclusive prefix sum over allocation requests
size_t bytes_needed = 0;
for (int i = 0; i < ALLOCATIONS; ++i)
{
size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
allocation_sizes[i] = bytes_needed;
bytes_needed += allocation_bytes;
}
// Check if the caller is simply requesting the size of the storage allocation
if (!d_temp_storage)
{
temp_storage_bytes = bytes_needed;
return cudaSuccess;
}
// Check if enough storage provided
if (temp_storage_bytes < bytes_needed)
{
return CubDebug(cudaErrorMemoryAllocation);
}
// Alias
for (int i = 0; i < ALLOCATIONS; ++i)
{
allocations[i] = static_cast<char*>(d_temp_storage) + allocation_sizes[i];
}
return cudaSuccess;
}
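/*
* Editorial usage sketch (not part of the original CUB source): the usual
* two-pass pattern for AliasTemporaries. Note that this version rewrites
* allocation_sizes[] in place (sizes become offsets), so the requested sizes
* must be refilled before the second pass:
*
*     void  *allocations[2]      = {NULL, NULL};
*     size_t allocation_sizes[2] = {1000 * sizeof(int), 256 * sizeof(float)};
*
*     // Pass 1: query the total number of temporary bytes needed
*     size_t temp_storage_bytes = 0;
*     AliasTemporaries<2>(NULL, temp_storage_bytes, allocations, allocation_sizes);
*
*     allocation_sizes[0] = 1000 * sizeof(int);
*     allocation_sizes[1] = 256 * sizeof(float);
*
*     void *d_temp_storage = NULL;
*     cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
*     // Pass 2: carve d_temp_storage into two 256-byte-aligned sub-allocations
*     AliasTemporaries<2>(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
*/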
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Retrieves the PTX version (major * 100 + minor * 10)
*/
__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
#ifndef CUB_RUNTIME_ENABLED
// CUDA API calls not supported from this device
return cudaErrorInvalidConfiguration;
#else
cudaError_t error = cudaSuccess;
do
{
cudaFuncAttributes empty_kernel_attrs;
if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
ptx_version = empty_kernel_attrs.ptxVersion * 10;
}
while (0);
return error;
#endif
}
/**
* Synchronize the stream if specified
*/
__host__ __device__ __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#ifndef __CUDA_ARCH__
return cudaStreamSynchronize(stream);
#else
// Device can't yet sync on a specific stream
return cudaDeviceSynchronize();
#endif
}
/**
* \brief Properties of a given CUDA device and the corresponding PTX bundle
*/
class Device
{
private:
/// Type definition of the EmptyKernel kernel entry point
typedef void (*EmptyKernelPtr)();
/// Force EmptyKernel<void> to be generated if this class is used
__host__ __device__ __forceinline__
EmptyKernelPtr Empty()
{
return EmptyKernel<void>;
}
public:
// Version information
int sm_version; ///< SM version of target device (SM version X.YZ in XYZ integer form)
int ptx_version; ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form)
// Target device properties
int sm_count; ///< Number of SMs
int warp_threads; ///< Number of threads per warp
int smem_bank_bytes; ///< Number of bytes per SM bank
int smem_banks; ///< Number of smem banks
int smem_bytes; ///< Smem bytes per SM
int smem_alloc_unit; ///< Smem segment size
bool regs_by_block; ///< Whether registers are allocated by threadblock (or by warp)
int reg_alloc_unit; ///< Granularity of register allocation within the SM
int warp_alloc_unit; ///< Granularity of warp allocation within the SM
int max_sm_threads; ///< Maximum number of threads per SM
int max_sm_blocks; ///< Maximum number of threadblocks per SM
int max_block_threads; ///< Maximum number of threads per threadblock
int max_sm_registers; ///< Maximum number of registers per SM
int max_sm_warps; ///< Maximum number of warps per SM
/**
* Callback for initializing device properties
*/
template <typename ArchProps>
__host__ __device__ __forceinline__ void Callback()
{
warp_threads = ArchProps::WARP_THREADS;
smem_bank_bytes = ArchProps::SMEM_BANK_BYTES;
smem_banks = ArchProps::SMEM_BANKS;
smem_bytes = ArchProps::SMEM_BYTES;
smem_alloc_unit = ArchProps::SMEM_ALLOC_UNIT;
regs_by_block = ArchProps::REGS_BY_BLOCK;
reg_alloc_unit = ArchProps::REG_ALLOC_UNIT;
warp_alloc_unit = ArchProps::WARP_ALLOC_UNIT;
max_sm_threads = ArchProps::MAX_SM_THREADS;
max_sm_blocks = ArchProps::MAX_SM_THREADBLOCKS;
max_block_threads = ArchProps::MAX_BLOCK_THREADS;
max_sm_registers = ArchProps::MAX_SM_REGISTERS;
max_sm_warps = max_sm_threads / warp_threads;
}
public:
/**
* Initializer. Properties are retrieved for the specified GPU ordinal.
*/
__host__ __device__ __forceinline__
cudaError_t Init(int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED
// CUDA API calls not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do
{
// Fill in SM version
int major, minor;
if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
sm_version = major * 100 + minor * 10;
// Fill in static SM properties
// Initialize our device properties via callback from static device properties
ArchProps<100>::Callback(*this, sm_version);
// Fill in SM count
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
// Fill in PTX version
#if CUB_PTX_ARCH > 0
ptx_version = CUB_PTX_ARCH;
#else
if (CubDebug(error = PtxVersion(ptx_version))) break;
#endif
}
while (0);
return error;
#endif
}
/**
* Initializer. Properties are retrieved for the current GPU ordinal.
*/
__host__ __device__ __forceinline__
cudaError_t Init()
{
#ifndef CUB_RUNTIME_ENABLED
// CUDA API calls not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do
{
int device_ordinal;
if ((error = CubDebug(cudaGetDevice(&device_ordinal)))) break;
if ((error = Init(device_ordinal))) break;
}
while (0);
return error;
#endif
}
/**
* Computes maximum SM occupancy in thread blocks for the given kernel
*/
template <typename KernelPtr>
__host__ __device__ __forceinline__
cudaError_t MaxSmOccupancy(
int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM
KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy
int block_threads) ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED
// CUDA API calls not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
#else
cudaError_t error = cudaSuccess;
do
{
// Get kernel attributes
cudaFuncAttributes kernel_attrs;
if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
// Number of warps per threadblock
int block_warps = (block_threads + warp_threads - 1) / warp_threads;
// Max warp occupancy
int max_warp_occupancy = (block_warps > 0) ?
max_sm_warps / block_warps :
max_sm_blocks;
// Maximum register occupancy
int max_reg_occupancy;
if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
{
// Prevent divide-by-zero
max_reg_occupancy = max_sm_blocks;
}
else if (regs_by_block)
{
// Allocates registers by threadblock
int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
max_reg_occupancy = max_sm_registers / block_regs;
}
else
{
// Allocates registers by warp
int sm_sides = warp_alloc_unit;
int sm_registers_per_side = max_sm_registers / sm_sides;
int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
int warps_per_side = sm_registers_per_side / regs_per_warp;
int warps = warps_per_side * sm_sides;
max_reg_occupancy = warps / block_warps;
}
// Shared memory per threadblock
int block_allocated_smem = CUB_ROUND_UP_NEAREST(
kernel_attrs.sharedSizeBytes,
smem_alloc_unit);
// Max shared memory occupancy
int max_smem_occupancy = (block_allocated_smem > 0) ?
(smem_bytes / block_allocated_smem) :
max_sm_blocks;
// Max occupancy
max_sm_occupancy = CUB_MIN(
CUB_MIN(max_sm_blocks, max_warp_occupancy),
CUB_MIN(max_smem_occupancy, max_reg_occupancy));
// printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);
} while (0);
return error;
#endif
}
};
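/*
* Editorial usage sketch (not part of the original CUB source): querying
* device properties and the maximum number of resident thread blocks per SM
* for a (hypothetical) kernel:
*
*     template <typename T> __global__ void SomeKernel(T *d_data);
*
*     cudaError_t OccupancyExample(int &max_sm_occupancy)
*     {
*         cub::Device device_props;
*         cudaError_t error;
*         if (CubDebug(error = device_props.Init())) return error;   // current device
*
*         // Occupancy of SomeKernel<int> at 256 threads per block
*         return device_props.MaxSmOccupancy(max_sm_occupancy, SomeKernel<int>, 256);
*     }
*/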
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,718 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Random-access iterator types
*/
#pragma once
#include "thread/thread_load.cuh"
#include "util_device.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Texture references
*****************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Anonymous namespace
namespace {
/// Templated texture reference type
template <typename T>
struct TexIteratorRef
{
// Texture reference type
typedef texture<T, cudaTextureType1D, cudaReadModeElementType> TexRef;
static TexRef ref;
/**
* Bind texture
*/
static cudaError_t BindTexture(void *d_in)
{
cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<T>();
if (d_in)
return (CubDebug(cudaBindTexture(NULL, ref, d_in, tex_desc)));
return cudaSuccess;
}
/**
* Unbind textures
*/
static cudaError_t UnbindTexture()
{
return CubDebug(cudaUnbindTexture(ref));
}
};
// Texture reference definitions
template <typename Value>
typename TexIteratorRef<Value>::TexRef TexIteratorRef<Value>::ref = 0;
} // Anonymous namespace
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* Iterators
*****************************************************************************/
/**
* \brief A simple random-access iterator pointing to a range of constant values
*
* \par Overview
* ConstantIteratorRA is a random-access iterator that, when dereferenced, always
* returns the supplied constant of type \p OutputType.
*
* \tparam OutputType The value type of this iterator
*/
template <typename OutputType>
class ConstantIteratorRA
{
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typedef ConstantIteratorRA self_type;
typedef OutputType value_type;
typedef OutputType reference;
typedef OutputType* pointer;
typedef std::random_access_iterator_tag iterator_category;
typedef int difference_type;
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
OutputType val;
public:
/// Constructor
__host__ __device__ __forceinline__ ConstantIteratorRA(
const OutputType &val) ///< Constant value for the iterator instance to report
:
val(val)
{}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
__host__ __device__ __forceinline__ self_type operator++()
{
self_type i = *this;
return i;
}
__host__ __device__ __forceinline__ self_type operator++(int junk)
{
return *this;
}
__host__ __device__ __forceinline__ reference operator*()
{
return val;
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator+(SizeT n)
{
return ConstantIteratorRA(val);
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator-(SizeT n)
{
return ConstantIteratorRA(val);
}
template <typename SizeT>
__host__ __device__ __forceinline__ reference operator[](SizeT n)
{
return val;    // every position reports the same constant
}
__host__ __device__ __forceinline__ pointer operator->()
{
return &val;
}
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (val == rhs.val);
}
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (val != rhs.val);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
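/*
* Editorial usage sketch (not part of the original CUB source): a
* ConstantIteratorRA can stand in for an input sequence whose elements all
* have the same value:
*
*     __global__ void AddConstant(
*         float                          *d_out,
*         const float                    *d_in,
*         cub::ConstantIteratorRA<float>  ones,
*         int                             n)
*     {
*         int i = blockIdx.x * blockDim.x + threadIdx.x;
*         if (i < n) d_out[i] = d_in[i] + ones[i];    // ones[i] == 1.0f for every i
*     }
*
*     // Host side:
*     //     cub::ConstantIteratorRA<float> ones(1.0f);
*     //     AddConstant<<<grid, block>>>(d_out, d_in, ones, n);
*/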
/**
* \brief A simple random-access transform iterator for applying a transformation operator.
*
* \par Overview
* TransformIteratorRA is a random-access iterator that wraps both a native
* device pointer of type <tt>InputType*</tt> and a unary conversion functor of
* type \p ConversionOp. \p OutputType references are made by pulling \p InputType
* values through the \p ConversionOp instance.
*
* \tparam InputType The value type of the pointer being wrapped
* \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member <tt>OutputType operator()(const InputType &datum)</tt>.
* \tparam OutputType The value type of this iterator
*/
template <typename OutputType, typename ConversionOp, typename InputType>
class TransformIteratorRA
{
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typedef TransformIteratorRA self_type;
typedef OutputType value_type;
typedef OutputType reference;
typedef OutputType* pointer;
typedef std::random_access_iterator_tag iterator_category;
typedef int difference_type;
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
ConversionOp conversion_op;
InputType* ptr;
public:
/**
* \brief Constructor
* @param ptr Native pointer to wrap
* @param conversion_op Unary conversion functor
*/
__host__ __device__ __forceinline__ TransformIteratorRA(InputType* ptr, ConversionOp conversion_op) :
conversion_op(conversion_op),
ptr(ptr) {}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
__host__ __device__ __forceinline__ self_type operator++()
{
self_type i = *this;
ptr++;
return i;
}
__host__ __device__ __forceinline__ self_type operator++(int junk)
{
ptr++;
return *this;
}
__host__ __device__ __forceinline__ reference operator*()
{
return conversion_op(*ptr);
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator+(SizeT n)
{
TransformIteratorRA retval(ptr + n, conversion_op);
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator-(SizeT n)
{
TransformIteratorRA retval(ptr - n, conversion_op);
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ reference operator[](SizeT n)
{
return conversion_op(ptr[n]);
}
__host__ __device__ __forceinline__ pointer operator->()
{
return &conversion_op(*ptr);
}
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
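/*
* Editorial usage sketch (not part of the original CUB source): wrapping a
* raw device pointer so that values are squared on the fly as they are read:
*
*     struct Square
*     {
*         __host__ __device__ __forceinline__
*         float operator()(const float &x) const { return x * x; }
*     };
*
*     __global__ void SumSquares(
*         float                                           *d_sum,
*         cub::TransformIteratorRA<float, Square, float>   squares,
*         int                                              n)
*     {
*         // Serial reduction by thread 0, for illustration only
*         if (blockIdx.x == 0 && threadIdx.x == 0)
*         {
*             float sum = 0;
*             for (int i = 0; i < n; ++i) sum += squares[i];
*             *d_sum = sum;
*         }
*     }
*
*     // Host side:
*     //     cub::TransformIteratorRA<float, Square, float> squares(d_in, Square());
*/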
/**
* \brief A simple random-access iterator for loading primitive values through texture cache.
*
* \par Overview
* TexIteratorRA is a random-access iterator that wraps a native
* device pointer of type <tt>T*</tt>. References made through TexIteratorRA
* cause values to be pulled through the texture cache.
*
* \par Usage Considerations
* - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double
* - Only one TexIteratorRA or TexTransformIteratorRA of a certain value type \p T can be bound at any given time (per host thread)
*
* \tparam T The value type of the pointer being wrapped
*/
template <typename T>
class TexIteratorRA
{
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typedef TexIteratorRA self_type;
typedef T value_type;
typedef T reference;
typedef T* pointer;
typedef std::random_access_iterator_tag iterator_category;
typedef int difference_type;
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// Tag identifying iterator type as being texture-bindable
typedef void TexBindingTag;
private:
T* ptr;
size_t tex_align_offset;
cudaTextureObject_t tex_obj;
public:
/**
* \brief Constructor
*/
__host__ __device__ __forceinline__ TexIteratorRA()
:
ptr(NULL),
tex_align_offset(0),
tex_obj(0)
{}
/// \brief Bind iterator to texture reference
cudaError_t BindTexture(
T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
size_t bytes, ///< Number of items
size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator
{
this->ptr = ptr;
this->tex_align_offset = tex_align_offset;
int ptx_version;
cudaError_t error = cudaSuccess;
if (CubDebug(error = PtxVersion(ptx_version))) return error;
if (ptx_version >= 300)
{
// Use texture object
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>();
cudaResourceDesc res_desc;
cudaTextureDesc tex_desc;
memset(&res_desc, 0, sizeof(cudaResourceDesc));
memset(&tex_desc, 0, sizeof(cudaTextureDesc));
res_desc.resType = cudaResourceTypeLinear;
res_desc.res.linear.devPtr = ptr;
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = bytes;
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
}
else
{
// Use texture reference
return TexIteratorRef<T>::BindTexture(ptr);
}
}
/// \brief Unbind iterator from texture reference
cudaError_t UnbindTexture()
{
int ptx_version;
cudaError_t error = cudaSuccess;
if (CubDebug(error = PtxVersion(ptx_version))) return error;
if (ptx_version < 300)
{
// Use texture reference
return TexIteratorRef<T>::UnbindTexture();
}
else
{
// Use texture object
return cudaDestroyTextureObject(tex_obj);
}
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
__host__ __device__ __forceinline__ self_type operator++()
{
self_type i = *this;
ptr++;
tex_align_offset++;
return i;
}
__host__ __device__ __forceinline__ self_type operator++(int junk)
{
ptr++;
tex_align_offset++;
return *this;
}
__host__ __device__ __forceinline__ reference operator*()
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return *ptr;
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset);
#else
// Use the texture object
return tex1Dfetch<T>(tex_obj, tex_align_offset);
#endif
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator+(SizeT n)
{
TexIteratorRA retval;
retval.ptr = ptr + n;
retval.tex_align_offset = tex_align_offset + n;
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator-(SizeT n)
{
TexIteratorRA retval;
retval.ptr = ptr - n;
retval.tex_align_offset = tex_align_offset - n;
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ reference operator[](SizeT n)
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return ptr[n];
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset + n);
#else
// Use the texture object
return tex1Dfetch<T>(tex_obj, tex_align_offset + n);
#endif
}
__host__ __device__ __forceinline__ pointer operator->()
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return &(*ptr);
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return &(tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset));
#else
// Use the texture object
return &(tex1Dfetch<T>(tex_obj, tex_align_offset));
#endif
}
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
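/*
* Editorial usage sketch (not part of the original CUB source): host-side
* bind/unbind protocol around kernel launches that read through the texture
* cache; d_in must satisfy cudaDeviceProp::textureAlignment:
*
*     cub::TexIteratorRA<float> tex_in;
*     CubDebugExit(tex_in.BindTexture(d_in, num_items * sizeof(float)));
*
*     // Kernels dereference tex_in like a pointer; SM30+ targets use a
*     // texture object, older targets fall back to the shared texture reference.
*     // SomeKernel<<<grid, block>>>(d_out, tex_in, num_items);
*
*     CubDebugExit(tex_in.UnbindTexture());
*/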
/**
* \brief A simple random-access transform iterator for loading primitive values through texture cache and subsequently applying a transformation operator.
*
* \par Overview
* TexTransformIteratorRA is a random-access iterator that wraps both a native
* device pointer of type <tt>InputType*</tt> and a unary conversion functor of
* type \p ConversionOp. \p OutputType references are made by pulling \p InputType
* values through the texture cache and then transforming them using the
* \p ConversionOp instance.
*
* \par Usage Considerations
* - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double
* - Only one TexIteratorRA or TexTransformIteratorRA of a certain \p InputType can be bound at any given time (per host thread)
*
* \tparam InputType The value type of the pointer being wrapped
* \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member <tt>OutputType operator()(const InputType &datum)</tt>.
* \tparam OutputType The value type of this iterator
*/
template <typename OutputType, typename ConversionOp, typename InputType>
class TexTransformIteratorRA
{
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
typedef TexTransformIteratorRA self_type;
typedef OutputType value_type;
typedef OutputType reference;
typedef OutputType* pointer;
typedef std::random_access_iterator_tag iterator_category;
typedef int difference_type;
#endif // DOXYGEN_SHOULD_SKIP_THIS
/// Tag identifying iterator type as being texture-bindable
typedef void TexBindingTag;
private:
ConversionOp conversion_op;
InputType* ptr;
size_t tex_align_offset;
cudaTextureObject_t tex_obj;
public:
/**
* \brief Constructor
*/
TexTransformIteratorRA(
ConversionOp conversion_op) ///< Unary conversion functor
:
conversion_op(conversion_op),
ptr(NULL),
tex_align_offset(0),
tex_obj(0)
{}
/// \brief Bind iterator to texture reference
cudaError_t BindTexture(
InputType* ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
size_t bytes, ///< Number of items
size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator
{
this->ptr = ptr;
this->tex_align_offset = tex_align_offset;
int ptx_version;
cudaError_t error = cudaSuccess;
if (CubDebug(error = PtxVersion(ptx_version))) return error;
if (ptx_version >= 300)
{
// Use texture object
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<InputType>();
cudaResourceDesc res_desc;
cudaTextureDesc tex_desc;
memset(&res_desc, 0, sizeof(cudaResourceDesc));
memset(&tex_desc, 0, sizeof(cudaTextureDesc));
res_desc.resType = cudaResourceTypeLinear;
res_desc.res.linear.devPtr = ptr;
res_desc.res.linear.desc = channel_desc;
res_desc.res.linear.sizeInBytes = bytes;
tex_desc.readMode = cudaReadModeElementType;
return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
}
else
{
// Use texture reference
return TexIteratorRef<InputType>::BindTexture(ptr);
}
}
/// \brief Unbind iterator from texture reference
cudaError_t UnbindTexture()
{
int ptx_version;
cudaError_t error = cudaSuccess;
if (CubDebug(error = PtxVersion(ptx_version))) return error;
if (ptx_version >= 300)
{
// Use texture object
return cudaDestroyTextureObject(tex_obj);
}
else
{
// Use texture reference
return TexIteratorRef<InputType>::UnbindTexture();
}
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
__host__ __device__ __forceinline__ self_type operator++()
{
self_type i = *this;
ptr++;
tex_align_offset++;
return i;
}
__host__ __device__ __forceinline__ self_type operator++(int junk)
{
ptr++;
tex_align_offset++;
return *this;
}
__host__ __device__ __forceinline__ reference operator*()
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return conversion_op(*ptr);
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
#else
// Use the texture object
return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
#endif
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator+(SizeT n)
{
TexTransformIteratorRA retval(conversion_op);
retval.ptr = ptr + n;
retval.tex_align_offset = tex_align_offset + n;
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ self_type operator-(SizeT n)
{
TexTransformIteratorRA retval(conversion_op);
retval.ptr = ptr - n;
retval.tex_align_offset = tex_align_offset - n;
return retval;
}
template <typename SizeT>
__host__ __device__ __forceinline__ reference operator[](SizeT n)
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return conversion_op(ptr[n]);
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset + n));
#else
// Use the texture object
return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset + n));
#endif
}
__host__ __device__ __forceinline__ pointer operator->()
{
#if (CUB_PTX_ARCH == 0)
// Simply dereference the pointer on the host
return &conversion_op(*ptr);
#elif (CUB_PTX_ARCH < 300)
// Use the texture reference
return &conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset));
#else
// Use the texture object
return &conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset));
#endif
}
__host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
{
return (ptr == rhs.ptr);
}
__host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
{
return (ptr != rhs.ptr);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
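/*
* Editorial usage sketch (not part of the original CUB source): as above, but
* applying a unary functor to each value fetched through the texture cache:
*
*     struct Negate
*     {
*         __host__ __device__ __forceinline__
*         float operator()(const float &x) const { return -x; }
*     };
*
*     Negate negate;
*     cub::TexTransformIteratorRA<float, Negate, float> neg_in(negate);
*     CubDebugExit(neg_in.BindTexture(d_in, num_items * sizeof(float)));
*     // ... launch kernels that read *neg_in or neg_in[i] ...
*     CubDebugExit(neg_in.UnbindTexture());
*/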
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,107 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Common C/C++ macro utilities
******************************************************************************/
#pragma once
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/**
* Align struct
*/
#if defined(_WIN32) || defined(_WIN64)
#define CUB_ALIGN(bytes) __declspec(align(32))
#else
#define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
#endif
/**
* Select maximum(a, b)
*/
#define CUB_MAX(a, b) (((a) > (b)) ? (a) : (b))
/**
* Select minimum(a, b)
*/
#define CUB_MIN(a, b) (((a) < (b)) ? (a) : (b))
/**
* Quotient of x/y rounded down to nearest integer
*/
#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
/**
* Quotient of x/y rounded up to nearest integer
*/
#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
/**
* x rounded up to the nearest multiple of y
*/
#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
/**
* x rounded down to the nearest multiple of y
*/
#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
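// Illustrative arithmetic (not part of the original header), assuming positive
// integer arguments:
//   CUB_QUOTIENT_FLOOR(7, 2)     == 3
//   CUB_QUOTIENT_CEILING(7, 2)   == 4
//   CUB_ROUND_UP_NEAREST(7, 4)   == 8
//   CUB_ROUND_DOWN_NEAREST(7, 4) == 4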
/**
* Return character string for given type
*/
#define CUB_TYPE_STRING(type) ""#type
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#define CUB_CAT_(a, b) a ## b
#define CUB_CAT(a, b) CUB_CAT_(a, b)
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* Static assert
*/
#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
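// Illustrative usage (not part of the original header): the typedef's array
// size is -1 when the condition is false, which fails to compile.
//   CUB_STATIC_ASSERT(sizeof(int) >= 2, "int is too small");   // compiles
//   CUB_STATIC_ASSERT(sizeof(char) == 2, "never true");        // compile-time error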
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,41 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Place-holder for prefixing the cub namespace
*/
#pragma once
// For example:
//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
//#define CUB_NS_POSTFIX } }
#define CUB_NS_PREFIX
#define CUB_NS_POSTFIX

View File

@@ -0,0 +1,380 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* PTX intrinsics
*/
#pragma once
#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* PTX helper macros
******************************************************************************/
/**
* Register modifier for pointer-types (for inlining PTX assembly)
*/
#if defined(_WIN64) || defined(__LP64__)
#define __CUB_LP64__ 1
// 64-bit register modifier for inlined asm
#define _CUB_ASM_PTR_ "l"
#define _CUB_ASM_PTR_SIZE_ "u64"
#else
#define __CUB_LP64__ 0
// 32-bit register modifier for inlined asm
#define _CUB_ASM_PTR_ "r"
#define _CUB_ASM_PTR_SIZE_ "u32"
#endif
/******************************************************************************
* Inlined PTX intrinsics
******************************************************************************/
/**
* Shift-right then add. Returns (x >> shift) + addend.
*/
__device__ __forceinline__ unsigned int SHR_ADD(
unsigned int x,
unsigned int shift,
unsigned int addend)
{
unsigned int ret;
#if __CUDA_ARCH__ >= 200
asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
ret = (x >> shift) + addend;
#endif
return ret;
}
/**
* Shift-left then add. Returns (x << shift) + addend.
*/
__device__ __forceinline__ unsigned int SHL_ADD(
unsigned int x,
unsigned int shift,
unsigned int addend)
{
unsigned int ret;
#if __CUDA_ARCH__ >= 200
asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
ret = (x << shift) + addend;
#endif
return ret;
}
/**
* Bitfield-extract.
*/
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits)
{
unsigned int bits;
#if __CUDA_ARCH__ >= 200
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
const unsigned int MASK = (1 << num_bits) - 1;
bits = (source >> bit_start) & MASK;
#endif
return bits;
}
/**
* Bitfield-extract for 64-bit types.
*/
__device__ __forceinline__ unsigned int BFE(
unsigned long long source,
unsigned int bit_start,
unsigned int num_bits)
{
const unsigned long long MASK = (1ull << num_bits) - 1;
return (source >> bit_start) & MASK;
}
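// Illustrative example (not part of the original header): extracting a 16-bit
// field starting at bit 8.
//   BFE(0x00ABCD00u, 8, 16) == 0xABCD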
/**
* Bitfield insert. Inserts the first num_bits of y into x starting at bit_start
*/
__device__ __forceinline__ void BFI(
unsigned int &ret,
unsigned int x,
unsigned int y,
unsigned int bit_start,
unsigned int num_bits)
{
#if __CUDA_ARCH__ >= 200
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
// Portable fallback: insert the low num_bits of y into x starting at bit_start
// (mirrors the bfi.b32 semantics used above)
const unsigned int MASK = ((1 << num_bits) - 1) << bit_start;
ret = (x & ~MASK) | ((y << bit_start) & MASK);
#endif
}
/**
* Three-operand add
*/
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
x = x + y + z;
#endif
return x;
}
/**
* Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and
* reassemble them into a 32-bit destination register
*/
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
int ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
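// Illustrative example (not part of the original header): each 4-bit nibble of
// the index selects one byte from the 8-byte pool {a bytes 0-3, b bytes 4-7},
// least-significant result byte first.
//   PRMT(0x33221100, 0x77665544, 0x3210) == 0x33221100   // the four bytes of a
//   PRMT(0x33221100, 0x77665544, 0x7654) == 0x77665544   // the four bytes of b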
/**
* Sync-threads barrier.
*/
__device__ __forceinline__ void BAR(int count)
{
asm volatile("bar.sync 1, %0;" : : "r"(count));
}
/**
* Floating point multiply. (Mantissa LSB rounds towards zero.)
*/
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
float d;
asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
return d;
}
/**
* Floating point multiply-add. (Mantissa LSB rounds towards zero.)
*/
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
float d;
asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
return d;
}
/**
* Terminates the calling thread
*/
__device__ __forceinline__ void ThreadExit() {
asm("exit;");
}
/**
* Returns the warp lane ID of the calling thread
*/
__device__ __forceinline__ unsigned int LaneId()
{
unsigned int ret;
asm("mov.u32 %0, %laneid;" : "=r"(ret) );
return ret;
}
/**
* Returns the warp ID of the calling thread
*/
__device__ __forceinline__ unsigned int WarpId()
{
unsigned int ret;
asm("mov.u32 %0, %warpid;" : "=r"(ret) );
return ret;
}
/**
* Returns the warp lane mask of all lanes less than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLt()
{
unsigned int ret;
asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
return ret;
}
/**
* Returns the warp lane mask of all lanes less than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLe()
{
unsigned int ret;
asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) );
return ret;
}
/**
* Returns the warp lane mask of all lanes greater than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGt()
{
unsigned int ret;
asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) );
return ret;
}
/**
* Returns the warp lane mask of all lanes greater than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGe()
{
unsigned int ret;
asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) );
return ret;
}
/**
* Portable implementation of __all
*/
__device__ __forceinline__ int WarpAll(int cond)
{
#if CUB_PTX_ARCH < 120
__shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS];
if (LaneId() == 0)
warp_signals[WarpId()] = 1;
if (cond == 0)
warp_signals[WarpId()] = 0;
return warp_signals[WarpId()];
#else
return __all(cond);
#endif
}
/**
* Portable implementation of __any
*/
__device__ __forceinline__ int WarpAny(int cond)
{
#if CUB_PTX_ARCH < 120
__shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS];
if (LaneId() == 0)
warp_signals[WarpId()] = 0;
if (cond)
warp_signals[WarpId()] = 1;
return warp_signals[WarpId()];
#else
return __any(cond);
#endif
}
/// Generic shuffle-up
template <typename T>
__device__ __forceinline__ T ShuffleUp(
T input, ///< [in] The value to broadcast
int src_offset) ///< [in] The up-offset of the peer to read from
{
enum
{
SHFL_C = 0,
};
typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
#pragma unroll
for (int WORD = 0; WORD < WORDS; ++WORD)
{
unsigned int shuffle_word = input_alias[WORD];
asm(
" shfl.up.b32 %0, %1, %2, %3;"
: "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
output_alias[WORD] = (ShuffleWord) shuffle_word;
}
return output;
}
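// Illustrative usage sketch (assumption, not part of the original header):
// pulling the previous lane's value for a type wider than 32 bits, e.g. as the
// communication step of an inclusive warp scan.
//   double partial = per_lane_value();       // hypothetical per-lane input
//   double prev    = ShuffleUp(partial, 1);  // lane i receives lane i-1's value
//   // (out-of-range lanes keep their own value, since shfl.up clamps at lane 0)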
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,685 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Common type manipulation (metaprogramming) utilities
*/
#pragma once
#include <iostream>
#include <limits>
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* Type equality
******************************************************************************/
/**
* \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
*/
template <bool IF, typename ThenType, typename ElseType>
struct If
{
/// Conditional type result
typedef ThenType Type; // true
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename ThenType, typename ElseType>
struct If<false, ThenType, ElseType>
{
typedef ElseType Type; // false
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Conditional types
******************************************************************************/
/**
* \brief Type equality test
*/
template <typename A, typename B>
struct Equals
{
enum {
VALUE = 0,
NEGATE = 1
};
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename A>
struct Equals <A, A>
{
enum {
VALUE = 1,
NEGATE = 0
};
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
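// Illustrative examples (not part of the original header):
//   If<true,  int, float>::Type   // int
//   If<false, int, float>::Type   // float
//   Equals<int, int>::VALUE       // 1
//   Equals<int, long>::VALUE      // 0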
/******************************************************************************
* Marker types
******************************************************************************/
/**
* \brief A simple "NULL" marker type
*/
struct NullType
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename T>
__host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; }
#endif // DOXYGEN_SHOULD_SKIP_THIS
};
/**
* \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
*/
template <int A>
struct Int2Type
{
enum {VALUE = A};
};
/******************************************************************************
* Size and alignment
******************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename T>
struct WordAlignment
{
struct Pad
{
T val;
char byte;
};
enum
{
/// The alignment of T in bytes
ALIGN_BYTES = sizeof(Pad) - sizeof(T)
};
/// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<(ALIGN_BYTES % 4 == 0),
int,
typename If<(ALIGN_BYTES % 2 == 0),
short,
char>::Type>::Type ShuffleWord;
/// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<(ALIGN_BYTES % 8 == 0),
long long,
ShuffleWord>::Type VolatileWord;
/// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
typedef typename If<(ALIGN_BYTES % 16 == 0),
longlong2,
typename If<(ALIGN_BYTES % 8 == 0),
long long, // needed to get heterogeneous PODs to work on all platforms
ShuffleWord>::Type>::Type DeviceWord;
enum
{
DEVICE_MULTIPLE = sizeof(DeviceWord) / sizeof(T)
};
struct UninitializedBytes
{
char buf[sizeof(T)];
};
struct UninitializedShuffleWords
{
ShuffleWord buf[sizeof(T) / sizeof(ShuffleWord)];
};
struct UninitializedVolatileWords
{
VolatileWord buf[sizeof(T) / sizeof(VolatileWord)];
};
struct UninitializedDeviceWords
{
DeviceWord buf[sizeof(T) / sizeof(DeviceWord)];
};
};
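// Illustrative example (not part of the original header): for T = double,
// sizeof(Pad) == 16, so ALIGN_BYTES == 8; ShuffleWord is then int (8 % 4 == 0),
// VolatileWord is long long (8 % 8 == 0), and DeviceWord is long long
// (since 8 % 16 != 0).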
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Wrapper types
******************************************************************************/
/**
* \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
*/
template <typename T>
struct Uninitialized
{
/// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
typedef typename WordAlignment<T>::DeviceWord DeviceWord;
enum
{
WORDS = sizeof(T) / sizeof(DeviceWord)
};
/// Backing storage
DeviceWord storage[WORDS];
/// Alias
__host__ __device__ __forceinline__ T& Alias()
{
return reinterpret_cast<T&>(*this);
}
};
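// Illustrative sketch (assumption, not part of the original header): a type
// with a non-trivial default constructor cannot normally be declared __shared__
// or placed in a union, but its Uninitialized<> wrapper can, because the
// wrapper only reserves raw DeviceWord storage.
//   struct Counter { int n; __host__ __device__ Counter() : n(0) {} };
//   __shared__ Uninitialized<Counter> smem;  // raw storage; no constructor runs
//   Counter &counter = smem.Alias();         // reinterpret the storage as a Counter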
/**
* \brief A wrapper for passing simple static arrays as kernel parameters
*/
template <typename T, int COUNT>
struct ArrayWrapper
{
/// Static array of type \p T
T array[COUNT];
};
/**
* \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
*
* Many multi-pass computations require a pair of "ping-pong" storage
* buffers (e.g., one for reading from and the other for writing to, and then
* vice-versa for the subsequent pass). This structure wraps a set of device
* buffers and a "selector" member to track which is "current".
*/
template <typename T>
struct DoubleBuffer
{
/// Pair of device buffer pointers
T *d_buffers[2];
/// Selector into \p d_buffers (i.e., the active/valid buffer)
int selector;
/// \brief Constructor
__host__ __device__ __forceinline__ DoubleBuffer()
{
selector = 0;
d_buffers[0] = NULL;
d_buffers[1] = NULL;
}
/// \brief Constructor
__host__ __device__ __forceinline__ DoubleBuffer(
T *d_current, ///< The currently valid buffer
T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current
{
selector = 0;
d_buffers[0] = d_current;
d_buffers[1] = d_alternate;
}
/// \brief Return pointer to the currently valid buffer
__host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
};
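// Illustrative usage sketch (assumption, not part of the original header): a
// multi-pass computation reads from the current buffer, writes to the alternate
// buffer, then flips the selector for the next pass.
//   DoubleBuffer<int> keys(d_in, d_tmp);
//   for (int pass = 0; pass < num_passes; ++pass)
//   {
//       // read keys.d_buffers[keys.selector], write keys.d_buffers[keys.selector ^ 1]
//       keys.selector ^= 1;
//   }
//   int *d_result = keys.Current();          // final results live in the current buffer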
/******************************************************************************
* Static math
******************************************************************************/
/**
* \brief Statically determine log2(N), rounded up.
*
* For example:
* Log2<8>::VALUE // 3
* Log2<3>::VALUE // 2
*/
template <int N, int CURRENT_VAL = N, int COUNT = 0>
struct Log2
{
/// Static logarithm value
enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE }; // Inductive case
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <int N, int COUNT>
struct Log2<N, 0, COUNT>
{
enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case
COUNT :
COUNT - 1 };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Statically determine if N is a power-of-two
*/
template <int N>
struct PowerOfTwo
{
enum { VALUE = ((N & (N - 1)) == 0) };
};
/******************************************************************************
* Pointer vs. iterator detection
******************************************************************************/
/**
* \brief Pointer vs. iterator
*/
template <typename Tp>
struct IsPointer
{
enum { VALUE = 0 };
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename Tp>
struct IsPointer<Tp*>
{
enum { VALUE = 1 };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Qualifier detection
******************************************************************************/
/**
* \brief Volatile modifier test
*/
template <typename Tp>
struct IsVolatile
{
enum { VALUE = 0 };
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename Tp>
struct IsVolatile<Tp volatile>
{
enum { VALUE = 1 };
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Qualifier removal
******************************************************************************/
/**
* \brief Removes \p const and \p volatile qualifiers from type \p Tp.
*
* For example:
* <tt>typename RemoveQualifiers<volatile int>::Type // int;</tt>
*/
template <typename Tp, typename Up = Tp>
struct RemoveQualifiers
{
/// Type without \p const and \p volatile qualifiers
typedef Up Type;
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, volatile Up>
{
typedef Up Type;
};
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, const Up>
{
typedef Up Type;
};
template <typename Tp, typename Up>
struct RemoveQualifiers<Tp, const volatile Up>
{
typedef Up Type;
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Typedef-detection
******************************************************************************/
/**
* \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
*/
#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \
template <typename T> \
struct detector_name \
{ \
template <typename C> \
static char& test(typename C::nested_type_name*); \
template <typename> \
static int& test(...); \
enum \
{ \
VALUE = sizeof(test<T>(0)) < sizeof(int) \
}; \
};
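// Illustrative usage (not part of the original header):
//   CUB_DEFINE_DETECT_NESTED_TYPE(HasValueType, value_type)
//   struct WithNested    { typedef int value_type; };
//   struct WithoutNested {};
//   // HasValueType<WithNested>::VALUE == 1, HasValueType<WithoutNested>::VALUE == 0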
/******************************************************************************
* Simple enable-if (similar to Boost)
******************************************************************************/
/**
* \brief Simple enable-if (similar to Boost)
*/
template <bool Condition, class T = void>
struct EnableIf
{
/// Enable-if type for SFINAE dummy variables
typedef T Type;
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <class T>
struct EnableIf<false, T> {};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Typedef-detection
******************************************************************************/
/**
* \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
*/
template <typename T, typename BinaryOp>
struct BinaryOpHasIdxParam
{
private:
template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const> struct SFINAE1 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)> struct SFINAE2 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const> struct SFINAE3 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)> struct SFINAE4 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const> struct SFINAE5 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)> struct SFINAE6 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const> struct SFINAE7 {};
template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)> struct SFINAE8 {};
template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
template <typename BinaryOpT> static int Test(...);
public:
/// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
};
/******************************************************************************
* Simple type traits utilities.
*
* For example:
* Traits<int>::CATEGORY // SIGNED_INTEGER
* Traits<NullType>::NULL_TYPE // true
* Traits<uint4>::CATEGORY // NOT_A_NUMBER
* Traits<uint4>::PRIMITIVE; // false
*
******************************************************************************/
/**
* \brief Basic type traits categories
*/
enum Category
{
NOT_A_NUMBER,
SIGNED_INTEGER,
UNSIGNED_INTEGER,
FLOATING_POINT
};
/**
* \brief Basic type traits
*/
template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits>
struct BaseTraits
{
/// Category
static const Category CATEGORY = _CATEGORY;
enum
{
PRIMITIVE = _PRIMITIVE,
NULL_TYPE = _NULL_TYPE,
};
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Basic type traits (unsigned primitive specialization)
*/
template <typename _UnsignedBits>
struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits>
{
typedef _UnsignedBits UnsignedBits;
static const Category CATEGORY = UNSIGNED_INTEGER;
static const UnsignedBits MIN_KEY = UnsignedBits(0);
static const UnsignedBits MAX_KEY = UnsignedBits(-1);
enum
{
PRIMITIVE = true,
NULL_TYPE = false,
};
static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
{
return key;
}
static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
{
return key;
}
};
/**
* Basic type traits (signed primitive specialization)
*/
template <typename _UnsignedBits>
struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits>
{
typedef _UnsignedBits UnsignedBits;
static const Category CATEGORY = SIGNED_INTEGER;
static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
static const UnsignedBits MIN_KEY = HIGH_BIT;
static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT;
enum
{
PRIMITIVE = true,
NULL_TYPE = false,
};
static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
{
return key ^ HIGH_BIT;
};
static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
{
return key ^ HIGH_BIT;
};
};
/**
* Basic type traits (fp primitive specialization)
*/
template <typename _UnsignedBits>
struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits>
{
typedef _UnsignedBits UnsignedBits;
static const Category CATEGORY = FLOATING_POINT;
static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
static const UnsignedBits MIN_KEY = UnsignedBits(-1);
static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT;
static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
{
UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
return key ^ mask;
};
static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
{
UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
return key ^ mask;
};
enum
{
PRIMITIVE = true,
NULL_TYPE = false,
};
};
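// Illustrative example (not part of the original header) of how the key
// "twiddling" above yields an ascending unsigned ordering for radix sort:
//   signed char -1 (0xFF) -> TwiddleIn -> 0x7F
//   signed char  0 (0x00) -> TwiddleIn -> 0x80
//   signed char +1 (0x01) -> TwiddleIn -> 0x81
// so -1 < 0 < +1 still holds when the twiddled keys are compared as unsigned.
// For floating-point keys, negative values have all bits flipped and
// non-negative values have only the sign bit flipped, with the same effect.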
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Numeric type traits
*/
template <typename T> struct NumericTraits : BaseTraits<NOT_A_NUMBER, false, false, T> {};
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
template <> struct NumericTraits<NullType> : BaseTraits<NOT_A_NUMBER, false, true, NullType> {};
template <> struct NumericTraits<char> : BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {};
template <> struct NumericTraits<signed char> : BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {};
template <> struct NumericTraits<short> : BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {};
template <> struct NumericTraits<int> : BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {};
template <> struct NumericTraits<long> : BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {};
template <> struct NumericTraits<long long> : BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {};
template <> struct NumericTraits<unsigned char> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {};
template <> struct NumericTraits<unsigned short> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {};
template <> struct NumericTraits<unsigned int> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {};
template <> struct NumericTraits<unsigned long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {};
template <> struct NumericTraits<unsigned long long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {};
template <> struct NumericTraits<float> : BaseTraits<FLOATING_POINT, true, false, unsigned int> {};
template <> struct NumericTraits<double> : BaseTraits<FLOATING_POINT, true, false, unsigned long long> {};
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Type traits
*/
template <typename T>
struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,166 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Vector type inference utilities
*/
#pragma once
#include <iostream>
#include "util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilModule
* @{
*/
/******************************************************************************
* Vector type inference utilities. For example:
*
* typename VectorHelper<unsigned int, 2>::Type // Aliases uint2
*
******************************************************************************/
/**
* \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the VectorHelper structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
*/
template <typename T, int vec_elements> struct VectorHelper;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
enum
{
/// The maximum number of elements in CUDA vector types
MAX_VEC_ELEMENTS = 4,
};
/**
* Generic vector-1 type
*/
template <typename T>
struct VectorHelper<T, 1>
{
enum { BUILT_IN = false };
T x;
typedef VectorHelper<T, 1> Type;
};
/**
* Generic vector-2 type
*/
template <typename T>
struct VectorHelper<T, 2>
{
enum { BUILT_IN = false };
T x;
T y;
typedef VectorHelper<T, 2> Type;
};
/**
* Generic vector-3 type
*/
template <typename T>
struct VectorHelper<T, 3>
{
enum { BUILT_IN = false };
T x;
T y;
T z;
typedef VectorHelper<T, 3> Type;
};
/**
* Generic vector-4 type
*/
template <typename T>
struct VectorHelper<T, 4>
{
enum { BUILT_IN = false };
T x;
T y;
T z;
T w;
typedef VectorHelper<T, 4> Type;
};
/**
* Macro for expanding partially-specialized built-in vector types
*/
#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \
template<> struct VectorHelper<base_type, 1> { typedef short_type##1 Type; enum { BUILT_IN = true }; }; \
template<> struct VectorHelper<base_type, 2> { typedef short_type##2 Type; enum { BUILT_IN = true }; }; \
template<> struct VectorHelper<base_type, 3> { typedef short_type##3 Type; enum { BUILT_IN = true }; }; \
template<> struct VectorHelper<base_type, 4> { typedef short_type##4 Type; enum { BUILT_IN = true }; };
// Expand CUDA vector types for built-in primitives
CUB_DEFINE_VECTOR_TYPE(char, char)
CUB_DEFINE_VECTOR_TYPE(signed char, char)
CUB_DEFINE_VECTOR_TYPE(short, short)
CUB_DEFINE_VECTOR_TYPE(int, int)
CUB_DEFINE_VECTOR_TYPE(long, long)
CUB_DEFINE_VECTOR_TYPE(long long, longlong)
CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar)
CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort)
CUB_DEFINE_VECTOR_TYPE(unsigned int, uint)
CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong)
CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
CUB_DEFINE_VECTOR_TYPE(float, float)
CUB_DEFINE_VECTOR_TYPE(double, double)
CUB_DEFINE_VECTOR_TYPE(bool, uchar)
// Undefine macros
#undef CUB_DEFINE_VECTOR_TYPE
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group UtilModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,358 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps.
*/
#pragma once
#include "../../thread/thread_operators.cuh"
#include "../../util_ptx.cuh"
#include "../../util_type.cuh"
#include "../../util_macro.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps.
*/
template <
typename T, ///< Data type being reduced
int LOGICAL_WARPS, ///< Number of logical warps entrant
int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp
struct WarpReduceShfl
{
/******************************************************************************
* Constants and typedefs
******************************************************************************/
enum
{
/// The number of warp reduction steps
STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
// The 5-bit SHFL mask for logically splitting warps into sub-segments
SHFL_MASK = (-1 << STEPS) & 31,
// The 5-bit SHFL clamp
SHFL_CLAMP = LOGICAL_WARP_THREADS - 1,
// The packed C argument (mask starts 8 bits up)
SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP,
};
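// Illustrative values (not part of the original header): for
// LOGICAL_WARP_THREADS == 16, STEPS == 4, SHFL_MASK == 16, SHFL_CLAMP == 15,
// so SHFL_C == (16 << 8) | 15 == 0x100F; for a full 32-thread warp,
// SHFL_MASK == 0 and SHFL_C == 0x001F.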
/// Shared memory storage layout type
typedef NullType TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
int warp_id;
int lane_id;
/******************************************************************************
* Construction
******************************************************************************/
/// Constructor
__device__ __forceinline__ WarpReduceShfl(
TempStorage &temp_storage,
int warp_id,
int lane_id)
:
warp_id(warp_id),
lane_id(lane_id)
{}
/******************************************************************************
* Operation
******************************************************************************/
/// Summation (single-SHFL)
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
Int2Type<true> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required
{
unsigned int output = reinterpret_cast<unsigned int &>(input);
// Iterate reduction steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
if (FULL_WARPS)
{
// Use predicate set from SHFL to guard against invalid peers
asm(
"{"
" .reg .u32 r0;"
" .reg .pred p;"
" shfl.down.b32 r0|p, %1, %2, %3;"
" @p add.u32 r0, r0, %4;"
" mov.u32 %0, r0;"
"}"
: "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output));
}
else
{
// Set range predicate to guard against invalid peers
asm(
"{"
" .reg .u32 r0;"
" .reg .pred p;"
" shfl.down.b32 r0, %1, %2, %3;"
" setp.lt.u32 p, %5, %6;"
" mov.u32 %0, %1;"
" @p add.u32 %0, %1, r0;"
"}"
: "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
}
}
return output;
}
/// Summation (multi-SHFL)
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
Int2Type<false> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required
{
// Delegate to generic reduce
return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum());
}
/// Summation (float)
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
__device__ __forceinline__ float Sum(
float input, ///< [in] Calling thread's input
int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp
{
T output = input;
// Iterate reduction steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
if (FULL_WARPS)
{
// Use predicate set from SHFL to guard against invalid peers
asm(
"{"
" .reg .f32 r0;"
" .reg .pred p;"
" shfl.down.b32 r0|p, %1, %2, %3;"
" @p add.f32 r0, r0, %4;"
" mov.f32 %0, r0;"
"}"
: "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output));
}
else
{
// Set range predicate to guard against invalid peers
asm(
"{"
" .reg .f32 r0;"
" .reg .pred p;"
" shfl.down.b32 r0, %1, %2, %3;"
" setp.lt.u32 p, %5, %6;"
" mov.f32 %0, %1;"
" @p add.f32 %0, %0, r0;"
"}"
: "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp));
}
}
return output;
}
/// Summation (generic)
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane
typename _T>
__device__ __forceinline__ _T Sum(
_T input, ///< [in] Calling thread's input
int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp
{
// Whether sharing can be done with a single SHFL instruction (vs multiple SHFL instructions)
Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl;
return Sum<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, single_shfl);
}
/// Reduction
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output = input;
T temp;
ShuffleWord *temp_alias = reinterpret_cast<ShuffleWord *>(&temp);
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
// Grab addend from peer
const int OFFSET = 1 << STEP;
#pragma unroll
for (int WORD = 0; WORD < WORDS; ++WORD)
{
unsigned int shuffle_word = output_alias[WORD];
asm(
" shfl.down.b32 %0, %1, %2, %3;"
: "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C));
temp_alias[WORD] = (ShuffleWord) shuffle_word;
}
// Perform reduction op if from a valid peer
if (FULL_WARPS)
{
if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
output = reduction_op(output, temp);
}
else
{
if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp)
output = reduction_op(output, temp);
}
}
return output;
}
/// Segmented reduction
template <
bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail
typename Flag,
typename ReductionOp>
__device__ __forceinline__ T SegmentedReduce(
T input, ///< [in] Calling thread's input
Flag flag, ///< [in] Whether or not the current lane is a segment head/tail
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
T output = input;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T temp;
ShuffleWord *temp_alias = reinterpret_cast<ShuffleWord *>(&temp);
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
// Get the start flags for each thread in the warp.
int warp_flags = __ballot(flag);
if (!HEAD_SEGMENTED)
warp_flags <<= 1;
// Keep bits above the current thread.
warp_flags &= LaneMaskGt();
// Accommodate packing of multiple logical warps in a single physical warp
if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32))
warp_flags >>= (warp_id * LOGICAL_WARP_THREADS);
// Find next flag
int next_flag = __clz(__brev(warp_flags));
// Clip the next segment at the warp boundary if necessary
if (LOGICAL_WARP_THREADS != 32)
next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
// Grab addend from peer
#pragma unroll
for (int WORD = 0; WORD < WORDS; ++WORD)
{
unsigned int shuffle_word = output_alias[WORD];
asm(
" shfl.down.b32 %0, %1, %2, %3;"
: "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C));
temp_alias[WORD] = (ShuffleWord) shuffle_word;
}
// Perform reduction op if valid
if (OFFSET < next_flag - lane_id)
output = reduction_op(output, temp);
}
return output;
}
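// Illustrative walkthrough (not part of the original header): with head flags
// at lanes 0 and 6, lane 3 keeps only bit 6 after the LaneMaskGt() mask, so
// next_flag == 6.  It then accumulates peers at offsets 1 and 2 (lanes 4 and 5)
// but skips offset 4, because 4 >= next_flag - lane_id == 3, which keeps the
// reduction inside lane 3's segment.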
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,291 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps.
*/
#pragma once
#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps.
*/
template <
typename T, ///< Data type being reduced
int LOGICAL_WARPS, ///< Number of logical warps entrant
int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp
struct WarpReduceSmem
{
/******************************************************************************
* Constants and typedefs
******************************************************************************/
enum
{
/// Whether the logical warp size is a power-of-two
POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
/// The number of warp scan steps
STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
/// The number of threads in half a warp
HALF_WARP_THREADS = 1 << (STEPS - 1),
/// The number of shared memory elements per warp
WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
};
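// Illustrative note (not part of the original header): the extra half warp of
// storage means the unguarded peer loads at index (lane_id + OFFSET) in the
// full-warp, power-of-two case stay inside the allocation, since the largest
// index touched is (LOGICAL_WARP_THREADS - 1) + HALF_WARP_THREADS.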
/// Shared memory flag type
typedef unsigned char SmemFlag;
/// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS];
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************************
* Thread fields
******************************************************************************/
_TempStorage &temp_storage;
int warp_id;
int lane_id;
/******************************************************************************
* Construction
******************************************************************************/
/// Constructor
__device__ __forceinline__ WarpReduceSmem(
TempStorage &temp_storage,
int warp_id,
int lane_id)
:
temp_storage(temp_storage.Alias()),
warp_id(warp_id),
lane_id(lane_id)
{}
/******************************************************************************
* Operation
******************************************************************************/
/**
* Reduction
*/
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane
typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp
ReductionOp reduction_op) ///< [in] Reduction operator
{
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
// Share input through buffer
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
// Update input if peer_addend is in range
if ((FULL_WARPS && POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp))
{
T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
input = reduction_op(input, peer_addend);
}
}
return input;
}
/**
* Segmented reduction
*/
template <
bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail
typename Flag,
typename ReductionOp>
__device__ __forceinline__ T SegmentedReduce(
T input, ///< [in] Calling thread's input
Flag flag, ///< [in] Whether or not the current lane is a segment head/tail
ReductionOp reduction_op) ///< [in] Reduction operator
{
#if CUB_PTX_ARCH >= 200
// Ballot-based segmented reduce
// Get the start flags for each thread in the warp.
int warp_flags = __ballot(flag);
if (!HEAD_SEGMENTED)
warp_flags <<= 1;
// Keep bits above the current thread.
warp_flags &= LaneMaskGt();
// Accommodate packing of multiple logical warps in a single physical warp
if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32))
warp_flags >>= (warp_id * LOGICAL_WARP_THREADS);
// Find next flag
int next_flag = __clz(__brev(warp_flags));
// Clip the next segment at the warp boundary if necessary
if (LOGICAL_WARP_THREADS != 32)
next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
// Share input into buffer
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
// Update input if peer_addend is in range
if (OFFSET < next_flag - lane_id)
{
T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
input = reduction_op(input, peer_addend);
}
}
return input;
#else
// Smem-based segmented reduce
enum
{
UNSET = 0x0, // Is initially unset
SET = 0x1, // Is initially set
SEEN = 0x2, // Has seen another head flag from a successor peer
};
// Alias flags onto shared data storage
volatile SmemFlag *flag_storage = reinterpret_cast<SmemFlag*>(temp_storage[warp_id]);
SmemFlag flag_status = (flag) ? SET : UNSET;
for (int STEP = 0; STEP < STEPS; STEP++)
{
const int OFFSET = 1 << STEP;
// Share input through buffer
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input);
// Get peer from buffer
T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]);
// Share flag through buffer
flag_storage[lane_id] = flag_status;
// Get peer flag from buffer
SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
// Update input if peer was in range
if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
{
if (HEAD_SEGMENTED)
{
// Head-segmented
if ((flag_status & SEEN) == 0)
{
// Has not seen a more distant head flag
if (peer_flag_status & SET)
{
// Has now seen a head flag
flag_status |= SEEN;
}
else
{
// Peer is not a head flag: grab its count
input = reduction_op(input, peer_addend);
}
// Update seen status to include that of peer
flag_status |= (peer_flag_status & SEEN);
}
}
else
{
// Tail-segmented. Simply propagate flag status
if (!flag_status)
{
input = reduction_op(input, peer_addend);
flag_status |= peer_flag_status;
}
}
}
}
return input;
#endif
}
/**
* Summation
*/
template <
bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items
int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input
int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp
{
return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum());
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

View File

@@ -0,0 +1,371 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps.
*/
#pragma once
#include "../../thread/thread_operators.cuh"
#include "../../util_type.cuh"
#include "../../util_ptx.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps.
*/
template <
typename T, ///< Data type being scanned
int LOGICAL_WARPS, ///< Number of logical warps entrant
int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp
struct WarpScanShfl
{
/******************************************************************************
* Constants and typedefs
******************************************************************************/
enum
{
/// The number of warp scan steps
STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
SHFL_C = ((-1 << STEPS) & 31) << 8,
};
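// Worked example (added for clarity, not in the original source): for a full
// 32-thread logical warp, STEPS = 5 and SHFL_C = ((-1 << 5) & 31) << 8 = 0 (no
// sub-segmenting); for a 16-thread logical warp, STEPS = 4 and
// SHFL_C = ((-1 << 4) & 31) << 8 = 0x1000, which confines SHFL source lanes to
// the owning 16-lane sub-segment.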
/// Shared memory storage layout type
typedef NullType TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
int warp_id;
int lane_id;
/******************************************************************************
* Construction
******************************************************************************/
/// Constructor
__device__ __forceinline__ WarpScanShfl(
TempStorage &temp_storage,
int warp_id,
int lane_id)
:
warp_id(warp_id),
lane_id(lane_id)
{}
/******************************************************************************
* Operation
******************************************************************************/
/// Broadcast
__device__ __forceinline__ T Broadcast(
T input, ///< [in] The value to broadcast
int src_lane) ///< [in] Which warp lane is to do the broadcasting
{
typedef typename WordAlignment<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
#pragma unroll
for (int WORD = 0; WORD < WORDS; ++WORD)
{
unsigned int shuffle_word = input_alias[WORD];
asm("shfl.idx.b32 %0, %1, %2, %3;"
: "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(LOGICAL_WARP_THREADS - 1));
output_alias[WORD] = (ShuffleWord) shuffle_word;
}
return output;
}
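// Note (added comment): for types wider than 32 bits the loop above moves the
// value word by word, e.g. an 8-byte type is typically moved as two separate
// 32-bit shfl.idx operations that are reassembled through output_alias.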
//---------------------------------------------------------------------
// Inclusive operations
//---------------------------------------------------------------------
/// Inclusive prefix sum with aggregate (single-SHFL)
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items.
Int2Type<true> single_shfl)
{
unsigned int temp = reinterpret_cast<unsigned int &>(input);
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
// Use predicate set from SHFL to guard against invalid peers
asm(
"{"
" .reg .u32 r0;"
" .reg .pred p;"
" shfl.up.b32 r0|p, %1, %2, %3;"
" @p add.u32 r0, r0, %4;"
" mov.u32 %0, r0;"
"}"
: "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp));
}
output = temp;
// Grab aggregate from last warp lane
warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
}
/// Inclusive prefix sum with aggregate (multi-SHFL)
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items.
Int2Type<false> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required
{
// Delegate to generic scan
InclusiveScan(input, output, Sum(), warp_aggregate);
}
/// Inclusive prefix sum with aggregate (specialized for float)
__device__ __forceinline__ void InclusiveSum(
float input, ///< [in] Calling thread's input item.
float &output, ///< [out] Calling thread's output item. May be aliased with \p input.
float &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
output = input;
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
// Use predicate set from SHFL to guard against invalid peers
asm(
"{"
" .reg .f32 r0;"
" .reg .pred p;"
" shfl.up.b32 r0|p, %1, %2, %3;"
" @p add.f32 r0, r0, %4;"
" mov.f32 %0, r0;"
"}"
: "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output));
}
// Grab aggregate from last warp lane
warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
}
/// Inclusive prefix sum with aggregate (specialized for unsigned long long)
__device__ __forceinline__ void InclusiveSum(
unsigned long long input, ///< [in] Calling thread's input item.
unsigned long long &output, ///< [out] Calling thread's output item. May be aliased with \p input.
unsigned long long &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
output = input;
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
// Use predicate set from SHFL to guard against invalid peers
asm(
"{"
" .reg .u32 r0;"
" .reg .u32 r1;"
" .reg .u32 lo;"
" .reg .u32 hi;"
" .reg .pred p;"
" mov.b64 {lo, hi}, %1;"
" shfl.up.b32 r0|p, lo, %2, %3;"
" shfl.up.b32 r1|p, hi, %2, %3;"
" @p add.cc.u32 r0, r0, lo;"
" @p addc.u32 r1, r1, hi;"
" mov.b64 %0, {r0, r1};"
"}"
: "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C));
}
// Grab aggregate from last warp lane
warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
}
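// Note (added comment): the 64-bit value is split into lo/hi 32-bit words that
// are shuffled separately; add.cc.u32 / addc.u32 then recombine them so the
// carry from the low word propagates into the high word.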
/// Inclusive prefix sum with aggregate (generic)
template <typename _T>
__device__ __forceinline__ void InclusiveSum(
_T input, ///< [in] Calling thread's input item.
_T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
_T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Whether sharing can be done with a single SHFL instruction (vs multiple SHFL instructions)
Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl;
InclusiveSum(input, output, warp_aggregate, single_shfl);
}
/// Inclusive prefix sum
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item.
T &output) ///< [out] Calling thread's output item. May be aliased with \p input.
{
T warp_aggregate;
InclusiveSum(input, output, warp_aggregate);
}
/// Inclusive scan with aggregate
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
output = input;
// Iterate scan steps
#pragma unroll
for (int STEP = 0; STEP < STEPS; STEP++)
{
// Grab addend from peer
const int OFFSET = 1 << STEP;
T temp = ShuffleUp(output, OFFSET);
// Perform scan op if from a valid peer
if (lane_id >= OFFSET)
output = scan_op(temp, output);
}
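// Illustrative trace (added comment, not in the original source): for an
// 8-lane logical warp with scan_op = Sum(), the loop runs STEPS = 3 times with
// OFFSET = 1, 2, 4.  After step 0 lane i holds x[i-1]+x[i]; after step 1 it
// holds the sum of (up to) the last 4 inputs ending at i; after step 2 every
// lane holds its full inclusive prefix.  Lanes with lane_id < OFFSET skip the
// combine because their partial is already the complete prefix.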
// Grab aggregate from last warp lane
warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
}
/// Inclusive scan
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op) ///< [in] Binary scan operator
{
T warp_aggregate;
InclusiveScan(input, output, scan_op, warp_aggregate);
}
//---------------------------------------------------------------------
// Exclusive operations
//---------------------------------------------------------------------
/// Exclusive scan with aggregate
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Compute inclusive scan
T inclusive;
InclusiveScan(input, inclusive, scan_op, warp_aggregate);
// Grab result from predecessor
T exclusive = ShuffleUp(inclusive, 1);
output = (lane_id == 0) ?
identity :
exclusive;
}
/// Exclusive scan
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T identity, ///< [in] Identity value
ScanOp scan_op) ///< [in] Binary scan operator
{
T warp_aggregate;
ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
}
/// Exclusive scan with aggregate, without identity
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Compute inclusive scan
T inclusive;
InclusiveScan(input, inclusive, scan_op, warp_aggregate);
// Grab result from predecessor
output = ShuffleUp(inclusive, 1);
}
/// Exclusive scan without identity
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op) ///< [in] Binary scan operator
{
T warp_aggregate;
ExclusiveScan(input, output, scan_op, warp_aggregate);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,327 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps.
*/
#pragma once
#include "../../thread/thread_operators.cuh"
#include "../../thread/thread_load.cuh"
#include "../../thread/thread_store.cuh"
#include "../../util_type.cuh"
#include "../../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \brief WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps.
*/
template <
typename T, ///< Data type being scanned
int LOGICAL_WARPS, ///< Number of logical warps entrant
int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp
struct WarpScanSmem
{
/******************************************************************************
* Constants and typedefs
******************************************************************************/
enum
{
/// The number of warp scan steps
STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
/// The number of threads in half a warp
HALF_WARP_THREADS = 1 << (STEPS - 1),
/// The number of shared memory elements per warp
WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
};
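// Worked example (added for clarity): with LOGICAL_WARP_THREADS = 32 this gives
// STEPS = 5, HALF_WARP_THREADS = 16 and WARP_SMEM_ELEMENTS = 48, i.e. 16
// padding slots followed by 32 data slots per logical warp, so loads at index
// (HALF_WARP_THREADS + lane_id - OFFSET) can never fall below zero.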
/// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS];
// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************************
* Thread fields
******************************************************************************/
_TempStorage &temp_storage;
unsigned int warp_id;
unsigned int lane_id;
/******************************************************************************
* Construction
******************************************************************************/
/// Constructor
__device__ __forceinline__ WarpScanSmem(
TempStorage &temp_storage,
int warp_id,
int lane_id)
:
temp_storage(temp_storage.Alias()),
warp_id(warp_id),
lane_id(lane_id)
{}
/******************************************************************************
* Operation
******************************************************************************/
/// Initialize identity padding (specialized for operations that have identity)
__device__ __forceinline__ void InitIdentity(Int2Type<true> has_identity)
{
T identity = T();
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], identity);
}
/// Initialize identity padding (specialized for operations without identity)
__device__ __forceinline__ void InitIdentity(Int2Type<false> has_identity)
{}
/// Basic inclusive scan iteration (template unrolled, base-case specialization)
template <
bool HAS_IDENTITY,
typename ScanOp>
__device__ __forceinline__ void ScanStep(
T &partial,
ScanOp scan_op,
Int2Type<STEPS> step)
{}
/// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
template <
bool HAS_IDENTITY,
int STEP,
typename ScanOp>
__device__ __forceinline__ void ScanStep(
T &partial,
ScanOp scan_op,
Int2Type<STEP> step)
{
const int OFFSET = 1 << STEP;
// Share partial into buffer
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial);
// Update partial if addend is in range
if (HAS_IDENTITY || (lane_id >= OFFSET))
{
T addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - OFFSET]);
partial = scan_op(addend, partial);
}
ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
}
/// Broadcast
__device__ __forceinline__ T Broadcast(
T input, ///< [in] The value to broadcast
unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting
{
if (lane_id == src_lane)
{
ThreadStore<STORE_VOLATILE>(temp_storage[warp_id], input);
}
return ThreadLoad<LOAD_VOLATILE>(temp_storage[warp_id]);
}
/// Basic inclusive scan
template <
bool HAS_IDENTITY,
bool SHARE_FINAL,
typename ScanOp>
__device__ __forceinline__ T BasicScan(
T partial, ///< Calling thread's input partial reduction
ScanOp scan_op) ///< Binary associative scan functor
{
// Iterate scan steps
ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<0>());
if (SHARE_FINAL)
{
// Share partial into buffer
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial);
}
return partial;
}
/// Inclusive prefix sum
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item.
T &output) ///< [out] Calling thread's output item. May be aliased with \p input.
{
const bool HAS_IDENTITY = Traits<T>::PRIMITIVE;
// Initialize identity region
InitIdentity(Int2Type<HAS_IDENTITY>());
// Compute inclusive warp scan (has identity, don't share final)
output = BasicScan<HAS_IDENTITY, false>(input, Sum());
}
/// Inclusive prefix sum with aggregate
__device__ __forceinline__ void InclusiveSum(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
const bool HAS_IDENTITY = Traits<T>::PRIMITIVE;
// Initialize identity region
InitIdentity(Int2Type<HAS_IDENTITY>());
// Compute inclusive warp scan (has identity, share final)
output = BasicScan<HAS_IDENTITY, true>(input, Sum());
// Retrieve aggregate in <em>warp-lane</em><sub>0</sub>
warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]);
}
/// Inclusive scan
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op) ///< [in] Binary scan operator
{
// Compute inclusive warp scan (no identity, don't share final)
output = BasicScan<false, false>(input, scan_op);
}
/// Inclusive scan with aggregate
template <typename ScanOp>
__device__ __forceinline__ void InclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Compute inclusive warp scan (no identity, share final)
output = BasicScan<false, true>(input, scan_op);
// Retrieve aggregate
warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]);
}
/// Exclusive scan
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T identity, ///< [in] Identity value
ScanOp scan_op) ///< [in] Binary scan operator
{
// Initialize identity region
ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], identity);
// Compute inclusive warp scan (identity, share final)
T inclusive = BasicScan<true, true>(input, scan_op);
// Retrieve exclusive scan
output = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]);
}
/// Exclusive scan with aggregate
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
T identity, ///< [in] Identity value
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Exclusive warp scan (which does share final)
ExclusiveScan(input, output, identity, scan_op);
// Retrieve aggregate
warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]);
}
/// Exclusive scan without identity
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op) ///< [in] Binary scan operator
{
// Compute inclusive warp scan (no identity, share final)
T inclusive = BasicScan<false, true>(input, scan_op);
// Retrieve exclusive scan
output = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]);
}
/// Exclusive scan with aggregate, without identity
template <typename ScanOp>
__device__ __forceinline__ void ExclusiveScan(
T input, ///< [in] Calling thread's input item.
T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
ScanOp scan_op, ///< [in] Binary scan operator
T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
{
// Exclusive warp scan (which does share final)
ExclusiveScan(input, output, scan_op);
// Retrieve aggregate
warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]);
}
};
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)


@ -0,0 +1,677 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads.
*/
#pragma once
#include "specializations/warp_reduce_shfl.cuh"
#include "specializations/warp_reduce_smem.cuh"
#include "../thread/thread_operators.cuh"
#include "../util_arch.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup WarpModule
* @{
*/
/**
* \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. ![](warp_reduce_logo.png)
*
* \par Overview
* A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
* uses a binary combining operator to compute a single aggregate from a list of input elements.
*
* \tparam T The reduction input/output element type
* \tparam LOGICAL_WARPS <b>[optional]</b> The number of entrant "logical" warps performing concurrent warp reductions. Default is 1.
* \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
*
* \par Simple Examples
* \warpcollective{WarpReduce}
* \par
* The code snippet below illustrates four concurrent warp sum reductions within a block of
* 128 threads (one per each of the 32-thread warps).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for 4 warps on type int
* typedef cub::WarpReduce<int, 4> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
* \p 2544, and \p 3568, respectively (and is undefined in other threads).
*
* \par
* The code snippet below illustrates a single warp sum reduction within a block of
* 128 threads.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for one warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
* ...
*
* // Only the first warp performs a reduction
* if (threadIdx.x < 32)
* {
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide sum to lane0
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the warp of threads is <tt>0, 1, 2, 3, ..., 31</tt>.
* The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
*
* \par Usage and Performance Considerations
* - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
* - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
* - Warp reductions are concurrent if more than one logical warp is participating
* - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
* - Uses synchronization-free communication between warp lanes when applicable
* - Zero bank conflicts for most types
* - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
* - Summation (<b><em>vs.</em></b> generic reduction)
* - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
*
*/
template <
typename T,
int LOGICAL_WARPS = 1,
int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS>
class WarpReduce
{
private:
/******************************************************************************
* Constants and typedefs
******************************************************************************/
enum
{
POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
};
public:
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two))
typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO),
WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>,
WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce;
#endif // DOXYGEN_SHOULD_SKIP_THIS
private:
/// Shared memory storage layout type for WarpReduce
typedef typename InternalWarpReduce::TempStorage _TempStorage;
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Warp ID
int warp_id;
/// Lane ID
int lane_id;
/******************************************************************************
* Utility methods
******************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ TempStorage private_storage;
return private_storage;
}
public:
/// \smemstorage{WarpReduce}
struct TempStorage : Uninitialized<_TempStorage> {};
/******************************************************************//**
* \name Collective constructors
*********************************************************************/
//@{
/**
* \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
*
*/
__device__ __forceinline__ WarpReduce()
:
temp_storage(PrivateStorage()),
warp_id((LOGICAL_WARPS == 1) ?
0 :
threadIdx.x / LOGICAL_WARP_THREADS),
lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
LaneId() :
threadIdx.x % LOGICAL_WARP_THREADS)
{}
/**
* \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
*/
__device__ __forceinline__ WarpReduce(
TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
:
temp_storage(temp_storage.Alias()),
warp_id((LOGICAL_WARPS == 1) ?
0 :
threadIdx.x / LOGICAL_WARP_THREADS),
lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ?
LaneId() :
threadIdx.x % LOGICAL_WARP_THREADS)
{}
/**
* \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers.
*/
__device__ __forceinline__ WarpReduce(
int warp_id, ///< [in] A suitable warp membership identifier
int lane_id) ///< [in] A lane identifier within the warp
:
temp_storage(PrivateStorage()),
warp_id(warp_id),
lane_id(lane_id)
{}
/**
* \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers.
*/
__device__ __forceinline__ WarpReduce(
TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage
int warp_id, ///< [in] A suitable warp membership identifier
int lane_id) ///< [in] A lane identifier within the warp
:
temp_storage(temp_storage.Alias()),
warp_id(warp_id),
lane_id(lane_id)
{}
//@} end member group
/******************************************************************//**
* \name Summation reductions
*********************************************************************/
//@{
/**
* \brief Computes a warp-wide sum in each active warp. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* \smemreuse
*
* The code snippet below illustrates four concurrent warp sum reductions within a block of
* 128 threads (one per each of the 32-thread warps).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for 4 warps on type int
* typedef cub::WarpReduce<int, 4> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide sums to each lane0
* int aggregate = WarpReduce(temp_storage).Sum(thread_data);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
* \p 2544, and \p 3568, respectively (and is undefined in other threads).
*
*/
__device__ __forceinline__ T Sum(
T input) ///< [in] Calling thread's input
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, LOGICAL_WARP_THREADS);
}
/**
* \brief Computes a partially-full warp-wide sum in each active warp. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined.
*
* \smemreuse
*
* The code snippet below illustrates a sum reduction within a single, partially-full
* block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item per thread if in range
* int thread_data;
* if (threadIdx.x < valid_items)
* thread_data = d_data[threadIdx.x];
*
* // Return the warp-wide sums to each lane0
* int aggregate = WarpReduce(temp_storage).Sum(
* thread_data, valid_items);
*
* \endcode
* \par
* Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
* is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is
* undefined in other threads).
*
*/
__device__ __forceinline__ T Sum(
T input, ///< [in] Calling thread's input
int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
{
// Determine if we don't need bounds checking
if (valid_items >= LOGICAL_WARP_THREADS)
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, valid_items);
}
else
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<false, 1>(input, valid_items);
}
}
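// Note (added comment): the test above runs at run time, but each branch
// instantiates a different FULL_WARPS specialization of the internal Sum, so
// the fully-populated case compiles without per-lane validity checks.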
/**
* \brief Computes a segmented sum in each active warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* \smemreuse
*
* The code snippet below illustrates a head-segmented warp sum
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item and flag per thread
* int thread_data = ...
* int head_flag = ...
*
* // Return the warp-wide sums to each lane0
* int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
* thread_data, head_flag);
*
* \endcode
* \par
* Suppose the set of input \p thread_data and \p head_flag across the block of threads
* is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
* \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
*
* \tparam Flag <b>[inferred]</b> An integer type used to flag segment heads
*
*/
template <
typename Flag>
__device__ __forceinline__ T HeadSegmentedSum(
T input, ///< [in] Calling thread's input
Flag head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment
{
return HeadSegmentedReduce(input, head_flag, cub::Sum());
}
/**
* \brief Computes a segmented sum in each active warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* \smemreuse
*
* The code snippet below illustrates a tail-segmented warp sum
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item and flag per thread
* int thread_data = ...
* int tail_flag = ...
*
* // Return the warp-wide sums to each lane0
* int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
* thread_data, tail_flag);
*
* \endcode
* \par
* Suppose the set of input \p thread_data and \p tail_flag across the block of threads
* is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
* \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
*
* \tparam Flag <b>[inferred]</b> An integer type used to flag segment tails
*/
template <
typename Flag>
__device__ __forceinline__ T TailSegmentedSum(
T input, ///< [in] Calling thread's input
Flag tail_flag) ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
{
return TailSegmentedReduce(input, tail_flag, cub::Sum());
}
//@} end member group
/******************************************************************//**
* \name Generic reductions
*********************************************************************/
//@{
/**
* \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates four concurrent warp max reductions within a block of
* 128 threads (one per each of the 32-thread warps).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for 4 warps on type int
* typedef cub::WarpReduce<int, 4> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item per thread
* int thread_data = ...
*
* // Return the warp-wide reductions to each lane0
* int aggregate = WarpReduce(temp_storage).Reduce(
* thread_data, cub::Max());
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>.
* The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
* \p 95, and \p 127, respectively (and is undefined in other threads).
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
}
/**
* \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>.
*
* All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined.
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a max reduction within a single, partially-full
* block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item per thread if in range
* int thread_data;
* if (threadIdx.x < valid_items)
* thread_data = d_data[threadIdx.x];
*
* // Return the warp-wide reductions to each lane0
* int aggregate = WarpReduce(temp_storage).Reduce(
* thread_data, cub::Max(), valid_items);
*
* \endcode
* \par
* Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items
* is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is
* undefined in other threads).
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <typename ReductionOp>
__device__ __forceinline__ T Reduce(
T input, ///< [in] Calling thread's input
ReductionOp reduction_op, ///< [in] Binary reduction operator
int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
{
// Determine if we don't need bounds checking
if (valid_items >= LOGICAL_WARP_THREADS)
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, valid_items, reduction_op);
}
else
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<false, 1>(input, valid_items, reduction_op);
}
}
/**
* \brief Computes a segmented reduction in each active warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a head-segmented warp max
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item and flag per thread
* int thread_data = ...
* int head_flag = ...
*
* // Return the warp-wide reductions to each lane0
* int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
* thread_data, head_flag, cub::Max());
*
* \endcode
* \par
* Suppose the set of input \p thread_data and \p head_flag across the block of threads
* is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
* \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename ReductionOp,
typename Flag>
__device__ __forceinline__ T HeadSegmentedReduce(
T input, ///< [in] Calling thread's input
Flag head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment
ReductionOp reduction_op) ///< [in] Reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op);
}
/**
* \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
*
* Supports non-commutative reduction operators
*
* \smemreuse
*
* The code snippet below illustrates a tail-segmented warp max
* reduction within a block of 32 threads (one warp).
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Specialize WarpReduce for a single warp on type int
* typedef cub::WarpReduce<int, 1> WarpReduce;
*
* // Allocate shared memory for WarpReduce
* __shared__ typename WarpReduce::TempStorage temp_storage;
*
* // Obtain one input item and flag per thread
* int thread_data = ...
* int tail_flag = ...
*
* // Return the warp-wide reductions to each lane0
* int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
* thread_data, tail_flag, cub::Max());
*
* \endcode
* \par
* Suppose the set of input \p thread_data and \p tail_flag across the block of threads
* is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
* respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
* \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
*
* \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
*/
template <
typename ReductionOp,
typename Flag>
__device__ __forceinline__ T TailSegmentedReduce(
T input, ///< [in] Calling thread's input
Flag tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
ReductionOp reduction_op) ///< [in] Reduction operator
{
return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op);
}
//@} end member group
};
/** @} */ // end group WarpModule
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)

File diff suppressed because it is too large


@ -0,0 +1,411 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BITSET_HPP
#define KOKKOS_BITSET_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Functional.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_Pair.hpp>
#include <impl/Kokkos_Bitset_impl.hpp>
#include <stdexcept>
namespace Kokkos {
template <typename Device>
class Bitset;
template <typename Device>
class ConstBitset;
template <typename DstDevice, typename SrcDevice>
void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
template <typename DstDevice, typename SrcDevice>
void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
template <typename DstDevice, typename SrcDevice>
void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
/// A thread safe bitset
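/// Usage sketch (illustrative comment, not part of the original header;
/// construction happens on the host, while set/reset/test may be called
/// concurrently from parallel code executing in Device's memory space):
///
///   Kokkos::Bitset<Device> marks(1000);  // 1000 bits, initially unset
///   // inside a parallel_for / kernel body:
///   marks.set(i);                        // atomically set bit i; true if it was unset
///   if (marks.test(i)) { /* ... */ }
///   marks.reset(i);                      // atomically clear bit i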
template <typename Device>
class Bitset
{
public:
typedef Device device_type;
typedef unsigned size_type;
enum { BIT_SCAN_REVERSE = 1u };
enum { MOVE_HINT_BACKWARD = 2u };
enum {
BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u
, BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE
, BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD
, BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
};
private:
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
enum { block_mask = block_size-1u };
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
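// Worked example (added for clarity): with 32-bit unsigned blocks these are
// block_size = 32, block_mask = 31 and block_shift = 5, so bit i lives in
// block (i >> 5) at position (i & 31).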
public:
Bitset(unsigned arg_size = 0u)
: m_size(arg_size)
, m_last_block_mask(0u)
, m_blocks("Bitset", ((m_size + block_mask) >> block_shift) )
{
for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
m_last_block_mask |= 1u << i;
}
}
Bitset<Device> & operator = (Bitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_last_block_mask = rhs.m_last_block_mask;
this->m_blocks = rhs.m_blocks;
return *this;
}
Bitset( Bitset<Device> const & rhs)
: m_size( rhs.m_size )
, m_last_block_mask( rhs.m_last_block_mask )
, m_blocks( rhs.m_blocks )
{}
KOKKOS_FORCEINLINE_FUNCTION
unsigned size() const
{ return m_size; }
unsigned count() const
{
Impl::BitsetCount< Bitset<Device> > f(*this);
return f.apply();
}
void set()
{
Kokkos::deep_copy(m_blocks, ~0u );
if (m_last_block_mask) {
//clear the unused bits in the last block
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
raw_deep_copy( m_blocks.ptr_on_device() + (m_blocks.size() -1u), &m_last_block_mask, sizeof(unsigned));
}
}
void reset()
{
Kokkos::deep_copy(m_blocks, 0u );
}
void clear()
{
Kokkos::deep_copy(m_blocks, 0u );
}
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned i ) const
{
if ( i < m_size ) {
unsigned * block_ptr = &m_blocks[ i >> block_shift ];
const unsigned mask = 1u << static_cast<int>( i & block_mask );
return !( atomic_fetch_or( block_ptr, mask ) & mask );
}
return false;
}
KOKKOS_FORCEINLINE_FUNCTION
bool reset( unsigned i ) const
{
if ( i < m_size ) {
unsigned * block_ptr = &m_blocks[ i >> block_shift ];
const unsigned mask = 1u << static_cast<int>( i & block_mask );
return atomic_fetch_and( block_ptr, ~mask ) & mask;
}
return false;
}
KOKKOS_FORCEINLINE_FUNCTION
bool test( unsigned i ) const
{
if ( i < m_size ) {
const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]);
const unsigned mask = 1u << static_cast<int>( i & block_mask );
return block & mask;
}
return false;
}
KOKKOS_FORCEINLINE_FUNCTION
unsigned max_hint() const
{
return m_blocks.size();
}
KOKKOS_INLINE_FUNCTION
Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
{
const unsigned block_idx = (hint >> block_shift) < m_blocks.size() ? (hint >> block_shift) : 0;
const unsigned offset = hint & block_mask;
unsigned block = volatile_load(&m_blocks[ block_idx ]);
block = !m_last_block_mask || (block_idx < (m_blocks.size()-1)) ? block : block & m_last_block_mask ;
return find_any_helper(block_idx, offset, block, scan_direction);
}
KOKKOS_INLINE_FUNCTION
Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
{
const unsigned block_idx = hint >> block_shift;
const unsigned offset = hint & block_mask;
unsigned block = volatile_load(&m_blocks[ block_idx ]);
block = !m_last_block_mask || (block_idx < (m_blocks.size()-1) ) ? ~block : ~block & m_last_block_mask ;
return find_any_helper(block_idx, offset, block, scan_direction);
}
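// Note (added comment): both finders return a Kokkos::pair; when .first is
// true, .second is the index of a matching bit near the hint, otherwise
// .second is an updated hint in a neighbouring block from which the caller can
// continue searching.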
private:
KOKKOS_FORCEINLINE_FUNCTION
Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const
{
Kokkos::pair<bool, unsigned> result( block > 0u, 0);
if (!result.first) {
result.second = update_hint( block_idx, offset, scan_direction );
}
else {
result.second = scan_block( (block_idx << block_shift)
, offset
, block
, scan_direction
);
}
return result;
}
KOKKOS_FORCEINLINE_FUNCTION
unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const
{
offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask;
block = Impl::rotate_right(block, offset);
return ((( !(scan_direction & BIT_SCAN_REVERSE) ?
Impl::bit_scan_forward(block) :
Impl::bit_scan_reverse(block)
) + offset
) & block_mask
) + block_start;
}
KOKKOS_FORCEINLINE_FUNCTION
unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const
{
block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1;
block_idx = block_idx >= 0 ? block_idx : m_blocks.size() - 1;
block_idx = block_idx < static_cast<long long>(m_blocks.size()) ? block_idx : 0;
return static_cast<unsigned>(block_idx)*block_size + offset;
}
private:
unsigned m_size;
unsigned m_last_block_mask;
View< unsigned *, device_type, MemoryTraits<RandomAccess> > m_blocks;
private:
template <typename DDevice>
friend class Bitset;
template <typename DDevice>
friend class ConstBitset;
template <typename Bitset>
friend struct Impl::BitsetCount;
template <typename DstDevice, typename SrcDevice>
friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
template <typename DstDevice, typename SrcDevice>
friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
};
template <typename Device>
class ConstBitset
{
public:
typedef Device device_type;
typedef unsigned size_type;
private:
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
enum { block_mask = block_size -1u };
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
public:
ConstBitset()
: m_size (0)
{}
ConstBitset(Bitset<Device> const& rhs)
: m_size(rhs.m_size)
, m_blocks(rhs.m_blocks)
{}
ConstBitset(ConstBitset<Device> const& rhs)
: m_size( rhs.m_size )
, m_blocks( rhs.m_blocks )
{}
ConstBitset<Device> & operator = (Bitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_blocks = rhs.m_blocks;
return *this;
}
ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs)
{
this->m_size = rhs.m_size;
this->m_blocks = rhs.m_blocks;
return *this;
}
KOKKOS_FORCEINLINE_FUNCTION
unsigned size() const
{
return m_size;
}
unsigned count() const
{
Impl::BitsetCount< ConstBitset<Device> > f(*this);
return f.apply();
}
KOKKOS_FORCEINLINE_FUNCTION
bool test( unsigned i ) const
{
if ( i < m_size ) {
const unsigned block = m_blocks[ i >> block_shift ];
const unsigned mask = 1u << static_cast<int>( i & block_mask );
return block & mask;
}
return false;
}
private:
unsigned m_size;
View< const unsigned *, device_type, MemoryTraits<RandomAccess> > m_blocks;
private:
template <typename DDevice>
friend class ConstBitset;
template <typename Bitset>
friend struct Impl::BitsetCount;
template <typename DstDevice, typename SrcDevice>
friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
template <typename DstDevice, typename SrcDevice>
friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
};
template <typename DstDevice, typename SrcDevice>
void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src)
{
if (dst.size() != src.size()) {
throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
}
typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size());
}
template <typename DstDevice, typename SrcDevice>
void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
{
if (dst.size() != src.size()) {
throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
}
typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size());
}
template <typename DstDevice, typename SrcDevice>
void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
{
if (dst.size() != src.size()) {
throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
}
typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size());
}
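// Usage sketch (illustrative comment, not part of the original header): mirror
// a device-resident bitset on the host for inspection, assuming HostDevice is
// a host-side device type and dev_bits is a Bitset living on the device:
//
//   Kokkos::Bitset<HostDevice> host_bits(dev_bits.size());
//   Kokkos::deep_copy(host_bits, dev_bits);  // raw block copy; sizes must match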
} // namespace Kokkos
#endif //KOKKOS_BITSET_HPP


@ -0,0 +1,636 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_DualView.hpp
/// \brief Declaration and definition of Kokkos::DualView.
///
/// This header file declares and defines Kokkos::DualView and its
/// related nonmember functions.
#ifndef KOKKOS_DUALVIEW_HPP
#define KOKKOS_DUALVIEW_HPP
#include <Kokkos_View.hpp>
namespace Kokkos {
/* \class DualView
* \brief Container to manage mirroring a Kokkos::View that lives
* in device memory with a Kokkos::View that lives in host memory.
*
* This class provides capabilities to manage data which exists in two
* memory spaces at the same time. It keeps views of the same layout
* on two memory spaces as well as modified flags for both
* allocations. Users are responsible for setting the modified flags
* manually if they change the data in either memory space, by calling
* the sync() method templated on the device where they modified the
* data. Users may synchronize data by calling the modify() function,
* templated on the device towards which they want to synchronize
* (i.e., the target of the one-way copy operation).
*
* The DualView class also provides convenience methods such as
* realloc, resize and capacity which call the appropriate methods of
* the underlying Kokkos::View objects.
*
* The four template arguments are the same as those of Kokkos::View.
* (Please refer to that class' documentation for a detailed
* description.)
*
* \tparam DataType The type of the entries stored in the container.
*
* \tparam Layout The array's layout in memory.
*
* \tparam Device The Kokkos Device type. If its memory space is
* not the same as the host's memory space, then DualView will
* contain two separate Views: one in device memory, and one in
* host memory. Otherwise, DualView will only store one View.
*
* \tparam MemoryTraits (optional) The user's intended memory access
* behavior. Please see the documentation of Kokkos::View for
* examples. The default suffices for most users.
*/
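/* A minimal usage sketch (assuming a device with a separate memory space,
* e.g. Kokkos::Cuda, and an array length n defined elsewhere):
*
*   typedef Kokkos::DualView<double*, Kokkos::LayoutLeft, Kokkos::Cuda> dv_type;
*   dv_type a ("A", n);
*   // ... fill a.h_view on the host ...
*   a.modify<dv_type::host_mirror_device_type> ();  // mark the host copy as changed
*   a.sync<dv_type::device_type> ();                // one-way copy: host -> device
*/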
template< class T , class L , class D, class M = MemoryManaged>
class DualView {
public:
//! \name Typedefs for device types and various Kokkos::View specializations.
//@{
//! The Kokkos Device type; same as the \c Device template parameter.
typedef D device_type;
//! The host mirror Kokkos Device type.
typedef typename D::host_mirror_device_type host_mirror_device_type;
//! The type of a Kokkos::View on the device.
typedef Kokkos::View<T,L,D,M> t_dev ;
/// \typedef t_host
/// \brief The type of a Kokkos::View host mirror of \c t_dev.
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
typedef t_dev t_host;
#else
typedef typename t_dev::HostMirror t_host ;
#endif
//! The type of a const View on the device.
typedef Kokkos::View<typename t_dev::const_data_type,L,D,M> t_dev_const;
/// \typedef t_host_const
/// \brief The type of a const View host mirror of \c t_dev_const.
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
typedef t_dev_const t_host_const;
#else
typedef typename t_dev_const::HostMirror t_host_const;
#endif
//! The type of a const, random-access View on the device.
typedef Kokkos::View<typename t_dev::const_data_type,L,D,Kokkos::MemoryRandomAccess> t_dev_const_randomread ;
/// \typedef t_host_const_randomread
/// \brief The type of a const, random-access View host mirror of
/// \c t_dev_const_randomread.
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
typedef t_dev_const_randomread t_host_const_randomread;
#else
typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
#endif
//! The type of an unmanaged View on the device.
typedef Kokkos::View<T, L, D, Kokkos::MemoryUnmanaged> t_dev_um;
//! The type of an unmanaged View host mirror of \c t_dev_um.
typedef Kokkos::View<typename t_host::data_type,
typename t_host::array_layout,
typename t_host::device_type,
Kokkos::MemoryUnmanaged> t_host_um;
//! The type of a const unmanaged View on the device.
typedef Kokkos::View<typename t_dev::const_data_type, L, D,
Kokkos::MemoryUnmanaged> t_dev_const_um;
//! The type of a const unmanaged View host mirror of \c t_dev_const_um.
typedef Kokkos::View<typename t_host::const_data_type,
typename t_host::array_layout,
typename t_host::device_type,
Kokkos::MemoryUnmanaged> t_host_const_um;
//@}
//! \name The same typedefs as a View for scalar, data, and value types.
//@{
typedef typename t_dev::value_type value_type;
typedef typename t_dev::const_value_type const_value_type;
typedef typename t_dev::non_const_value_type non_const_value_type;
//@}
//! \name The two View instances.
//@{
t_dev d_view;
t_host h_view;
//@}
//! \name Counters to keep track of changes ("modified" flags)
//@{
View<unsigned int,LayoutLeft,host_mirror_device_type> modified_device;
View<unsigned int,LayoutLeft,host_mirror_device_type> modified_host;
//@}
//! \name Constructors
//@{
/// \brief Empty constructor.
///
/// Both device and host View objects are constructed using their
/// default constructors. The "modified" flags are both initialized
/// to "unmodified."
DualView () :
modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host"))
{}
/// \brief Constructor that allocates View objects on both host and device.
///
/// This constructor works like the analogous constructor of View.
/// The first argument is a string label, which is entirely for your
/// benefit. (Different DualView objects may have the same label if
/// you like.) The arguments that follow are the dimensions of the
/// View objects. For example, if the View has three dimensions,
/// the first three integer arguments will be nonzero, and you may
/// omit the integer arguments that follow.
DualView (const std::string& label,
const size_t n0 = 0,
const size_t n1 = 0,
const size_t n2 = 0,
const size_t n3 = 0,
const size_t n4 = 0,
const size_t n5 = 0,
const size_t n6 = 0,
const size_t n7 = 0)
: d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
, h_view (d_view) // with UVM, host View is _always_ a shallow copy
#else
, h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
#endif
, modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device"))
, modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host"))
{}
//! Copy constructor (shallow copy)
template<class SS, class LS, class DS, class MS>
DualView (const DualView<SS,LS,DS,MS>& src) :
d_view (src.d_view),
h_view (src.h_view),
modified_device (src.modified_device),
modified_host (src.modified_host)
{}
/// \brief Create DualView from existing device and host View objects.
///
/// This constructor assumes that the device and host View objects
/// are synchronized. You, the caller, are responsible for making
/// sure this is the case before calling this constructor. After
/// this constructor returns, you may use DualView's sync() and
/// modify() methods to ensure synchronization of the View objects.
///
/// \param d_view_ Device View
/// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
DualView (const t_dev& d_view_, const t_host& h_view_) :
d_view (d_view_),
h_view (h_view_),
modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host"))
{
Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
}
//@}
//! \name Methods for synchronizing, marking as modified, and getting Views.
//@{
/// \brief Return a View on a specific device \c Device.
///
/// Please don't be afraid of the if_c expression in the return
/// value's type. That just tells the method what the return type
/// should be: t_dev if the \c Device template parameter matches
/// this DualView's device type, else t_host.
///
/// For example, suppose you create a DualView on Cuda, like this:
/// \code
/// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
/// dual_view_type DV ("my dual view", 100);
/// \endcode
/// If you want to get the CUDA device View, do this:
/// \code
/// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
/// \endcode
/// and if you want to get the host mirror of that View, do this:
/// \code
/// typedef typename Kokkos::Cuda::host_mirror_device_type host_device_type;
/// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
/// \endcode
template< class Device >
const typename Kokkos::Impl::if_c<
Kokkos::Impl::is_same<typename t_dev::memory_space,
typename Device::memory_space>::value,
t_dev,
t_host>::type view () const
{
return Kokkos::Impl::if_c<
Kokkos::Impl::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
t_dev,
t_host >::select (d_view , h_view);
}
/// \brief Update data on device or host only if data in the other
/// space has been marked as modified.
///
/// If \c Device is the same as this DualView's device type, then
/// copy data from host to device. Otherwise, copy data from device
/// to host. In either case, only copy if the source of the copy
/// has been modified.
///
/// This is a one-way synchronization only. If the target of the
/// copy has been modified, this operation will discard those
/// modifications. It will also reset both device and host modified
/// flags.
///
/// \note This method doesn't know on its own whether you modified
/// the data in either View. You must manually mark modified data
/// as modified, by calling the modify() method with the
/// appropriate template parameter.
template<class Device>
void sync () {
const unsigned int dev =
Kokkos::Impl::if_c<
Kokkos::Impl::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
unsigned int>::select (1, 0);
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
Kokkos::deep_copy (d_view, h_view);
modified_host() = modified_device() = 0;
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
Kokkos::deep_copy (h_view, d_view);
modified_host() = modified_device() = 0;
}
}
}
/// \brief Mark data as modified on the given device \c Device.
///
/// If \c Device is the same as this DualView's device type, then
/// mark the device's data as modified. Otherwise, mark the host's
/// data as modified.
template<class Device>
void modify () {
const unsigned int dev =
Kokkos::Impl::if_c<
Kokkos::Impl::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
unsigned int>::select (1, 0);
if (dev) { // if Device is the same as DualView's device type
// Increment the device's modified count.
modified_device () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
} else { // hopefully Device is the same as DualView's host type
// Increment the host's modified count.
modified_host () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
}
}
//@}
//! \name Methods for reallocating or resizing the View objects.
//@{
/// \brief Reallocate both View objects.
///
/// This discards any existing contents of the objects, and resets
/// their modified flags. It does <i>not</i> copy the old contents
/// of either View into the new View objects.
void realloc( const size_t n0 = 0 ,
const size_t n1 = 0 ,
const size_t n2 = 0 ,
const size_t n3 = 0 ,
const size_t n4 = 0 ,
const size_t n5 = 0 ,
const size_t n6 = 0 ,
const size_t n7 = 0 ) {
Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
h_view = d_view ;
#else
h_view = create_mirror_view( d_view );
#endif
/* Reset dirty flags */
modified_device() = modified_host() = 0;
}
/// \brief Resize both views, copying old contents into new if necessary.
///
/// This method only copies the old contents into the new View
/// objects for the device which was last marked as modified.
void resize( const size_t n0 = 0 ,
const size_t n1 = 0 ,
const size_t n2 = 0 ,
const size_t n3 = 0 ,
const size_t n4 = 0 ,
const size_t n5 = 0 ,
const size_t n6 = 0 ,
const size_t n7 = 0 ) {
if(modified_device() >= modified_host()) {
/* Resize on Device */
Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
h_view = d_view ;
#else
h_view = create_mirror_view( d_view );
#endif
/* Mark Device copy as modified */
modified_device() = modified_device()+1;
} else {
/* Realloc on Device */
Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
t_host temp_view = d_view ;
#else
t_host temp_view = create_mirror_view( d_view );
#endif
/* Remap on Host */
Kokkos::Impl::ViewRemap< t_host , t_host >( temp_view , h_view );
h_view = temp_view;
/* Mark Host copy as modified */
modified_host() = modified_host()+1;
}
}
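// Sketch of the difference (dv_type as in the class sketch above; sizes are
// illustrative):
//   dv_type a ("A", 100);
//   a.resize (200);   // keeps the old 100 entries from the side last marked modified
//   a.realloc (50);   // discards contents and resets both modified flags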
//@}
//! \name Methods for getting capacity, stride, or dimension(s).
//@{
//! The allocation size (same as Kokkos::View::capacity).
size_t capacity() const {
return d_view.capacity();
}
//! Get stride(s) for each dimension.
template< typename iType>
void stride(iType* stride_) const {
d_view.stride(stride_);
}
/* \brief return size of dimension 0 */
size_t dimension_0() const {return d_view.dimension_0();}
/* \brief return size of dimension 1 */
size_t dimension_1() const {return d_view.dimension_1();}
/* \brief return size of dimension 2 */
size_t dimension_2() const {return d_view.dimension_2();}
/* \brief return size of dimension 3 */
size_t dimension_3() const {return d_view.dimension_3();}
/* \brief return size of dimension 4 */
size_t dimension_4() const {return d_view.dimension_4();}
/* \brief return size of dimension 5 */
size_t dimension_5() const {return d_view.dimension_5();}
/* \brief return size of dimension 6 */
size_t dimension_6() const {return d_view.dimension_6();}
/* \brief return size of dimension 7 */
size_t dimension_7() const {return d_view.dimension_7();}
//@}
};
//
// Partial specializations of Kokkos::subview() for DualView objects.
//
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
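// Usage sketch (the destination DualView type must be given explicitly; a
// std::pair index range is assumed here as the subview argument):
//   typedef Kokkos::DualView<double*, Kokkos::LayoutLeft, Kokkos::Cuda> dv_type;
//   dv_type a ("A", 100);
//   dv_type b = Kokkos::subview<dv_type> (a, std::pair<size_t,size_t> (10, 20));
//   // b shares a's modified flags, so modify()/sync() remain consistent.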
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 , class ArgType6 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 ,
const ArgType6 & arg6 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
template< class DstViewType ,
class T , class L , class D , class M ,
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
DstViewType
subview( const DualView<T,L,D,M> & src ,
const ArgType0 & arg0 ,
const ArgType1 & arg1 ,
const ArgType2 & arg2 ,
const ArgType3 & arg3 ,
const ArgType4 & arg4 ,
const ArgType5 & arg5 ,
const ArgType6 & arg6 ,
const ArgType7 & arg7 )
{
DstViewType sub_view;
sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
sub_view.modified_device = src.modified_device;
sub_view.modified_host = src.modified_host;
return sub_view;
}
//
// Partial specialization of Kokkos::deep_copy() for DualView objects.
//
template< class DT , class DL , class DD , class DM ,
class ST , class SL , class SD , class SM >
void
deep_copy (DualView<DT,DL,DD,DM> dst, // taken by value: the shallow copy shares the Views and modified flags
const DualView<ST,SL,SD,SM>& src)
{
if (src.modified_device () >= src.modified_host ()) {
Kokkos::deep_copy (dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {
Kokkos::deep_copy (dst.h_view, src.h_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_device_type> ();
}
}
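// Usage sketch (dv_type and n as in the DualView class sketch above):
// deep_copy copies whichever side of src was modified most recently and marks
// the same side of dst as modified.
//   dv_type a ("A", n), b ("B", n);
//   // ... fill a.h_view, then a.modify<dv_type::host_mirror_device_type> () ...
//   Kokkos::deep_copy (b, a);          // copies the host views in this case
//   b.sync<dv_type::device_type> ();   // propagate to b's device view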
} // namespace Kokkos
#endif

View File

@ -0,0 +1,132 @@
#ifndef KOKKOS_FUNCTIONAL_HPP
#define KOKKOS_FUNCTIONAL_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Functional_impl.hpp>
namespace Kokkos {
// These should work for most types
template <typename T>
struct pod_hash
{
typedef T argument_type;
typedef T first_argument_type;
typedef uint32_t second_argument_type;
typedef uint32_t result_type;
KOKKOS_FORCEINLINE_FUNCTION
uint32_t operator()(T const & t) const
{ return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); }
KOKKOS_FORCEINLINE_FUNCTION
uint32_t operator()(T const & t, uint32_t seed) const
{ return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); }
};
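// Usage sketch (MyKey is a placeholder POD type; key is an instance of it):
//   struct MyKey { int i; int j; };
//   Kokkos::pod_hash<MyKey> hasher;
//   uint32_t h0 = hasher (key);        // implicit seed of 0
//   uint32_t h1 = hasher (key, 17u);   // explicit seed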
template <typename T>
struct pod_equal_to
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return Impl::bitwise_equal(&a,&b); }
};
template <typename T>
struct pod_not_equal_to
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return !Impl::bitwise_equal(&a,&b); }
};
template <typename T>
struct equal_to
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a == b; }
};
template <typename T>
struct not_equal_to
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a != b; }
};
template <typename T>
struct greater
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a > b; }
};
template <typename T>
struct less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a < b; }
};
template <typename T>
struct greater_equal
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a >= b; }
};
template <typename T>
struct less_equal
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
KOKKOS_FORCEINLINE_FUNCTION
bool operator()(T const & a, T const & b) const
{ return a <= b; }
};
} // namespace Kokkos
#endif //KOKKOS_FUNCTIONAL_HPP

View File

@ -0,0 +1,227 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_STATICCRSGRAPH_HPP
#define KOKKOS_STATICCRSGRAPH_HPP
#include <string>
#include <vector>
#include <Kokkos_View.hpp>
#include <Kokkos_Parallel.hpp> // for parallel_reduce
namespace Kokkos {
/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries. If a StaticCrsGraph is
/// used as the graph of a sparse matrix, then this is usually an
/// integer type, the type of the column indices in the sparse
/// matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
/// either to the Device type (if there are no more template
/// parameters) or to the Layout type (if there is at least one more
/// template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
/// corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets. Usually the default
/// parameter suffices. However, setting a nondefault value is
/// necessary in some cases, for example, if you want to have a
/// sparse matrix with dimensions (and therefore column indices)
/// that fit in \c int, but want to store more than <tt>INT_MAX</tt>
/// entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
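///
/// A usage sketch (assuming an OpenMP build; the adjacency data is
/// illustrative), using the create_staticcrsgraph() factory declared below:
/// \code
/// typedef Kokkos::StaticCrsGraph<int, Kokkos::OpenMP> graph_type;
/// std::vector< std::vector<int> > adj (3);
/// adj[0].push_back (1); adj[1].push_back (2); adj[2].push_back (0);
/// graph_type g = Kokkos::create_staticcrsgraph<graph_type> ("graph", adj);
/// // entries of row i0 occupy entries(row_map(i0)) .. entries(row_map(i0+1)-1)
/// \endcode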
template< class DataType,
class Arg1Type,
class Arg2Type = void,
typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
class StaticCrsGraph {
private:
typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
public:
typedef DataType data_type;
typedef typename traits::array_layout array_layout;
typedef typename traits::device_type device_type;
typedef SizeType size_type;
typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
typedef StaticCrsGraph< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror;
//typedef StaticCrsGraph< DataType , array_layout , Kokkos::Threads , SizeType > HostMirror;
typedef View< const size_type* , array_layout, device_type > row_map_type;
typedef View< DataType* , array_layout, device_type > entries_type;
entries_type entries;
row_map_type row_map;
//! Construct an empty view.
StaticCrsGraph () : entries(), row_map() {}
//! Copy constructor (shallow copy).
StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map)
{}
template<class EntriesType, class RowMapType>
StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
{}
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
StaticCrsGraph& operator= (const StaticCrsGraph& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
return *this;
}
/** \brief Destroy this view of the array.
* If the last view then allocated memory is deallocated.
*/
~StaticCrsGraph() {}
size_t numRows() const {
return row_map.dimension_0()>0?row_map.dimension_0()-1:0;
}
};
//----------------------------------------------------------------------------
template< class StaticCrsGraphType , class InputSizeType >
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
const std::vector< InputSizeType > & input );
template< class StaticCrsGraphType , class InputSizeType >
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
const std::vector< std::vector< InputSizeType > > & input );
//----------------------------------------------------------------------------
template< class DataType ,
class Arg1Type ,
class Arg2Type ,
typename SizeType >
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
template< class DataType ,
class Arg1Type ,
class Arg2Type ,
typename SizeType >
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <impl/Kokkos_StaticCrsGraph_factory.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class GraphType >
struct StaticCrsGraphMaximumEntry {
typedef typename GraphType::device_type device_type ;
typedef typename GraphType::data_type value_type ;
const typename GraphType::entries_type entries ;
StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {}
KOKKOS_INLINE_FUNCTION
void operator()( const unsigned i , value_type & update ) const
{ if ( update < entries(i) ) update = entries(i); }
KOKKOS_INLINE_FUNCTION
void init( value_type & update ) const
{ update = 0 ; }
KOKKOS_INLINE_FUNCTION
void join( volatile value_type & update ,
volatile const value_type & input ) const
{ if ( update < input ) update = input ; }
};
}
template< class DataType, class Arg1Type, class Arg2Type, typename SizeType >
DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph )
{
typedef StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType> GraphType ;
typedef Impl::StaticCrsGraphMaximumEntry< GraphType > FunctorType ;
DataType result = 0 ;
Kokkos::parallel_reduce( graph.entries.dimension_0(),
FunctorType(graph), result );
return result ;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_STATICCRSGRAPH_HPP */

View File

@ -0,0 +1,862 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_UnorderedMap.hpp
/// \brief Declaration and definition of Kokkos::UnorderedMap.
///
/// This header file declares and defines Kokkos::UnorderedMap and its
/// related nonmember functions.
#ifndef KOKKOS_UNORDERED_MAP_HPP
#define KOKKOS_UNORDERED_MAP_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Functional.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_Bitset.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_UnorderedMap_impl.hpp>
#include <iostream>
#include <stdint.h>
#include <stdexcept>
#if (defined( __GNUC__ ) || defined( __GNUG__ )) && not defined( __CUDACC__ )
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
#else
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
#endif
namespace Kokkos {
enum { UnorderedMapInvalidIndex = ~0u };
/// \brief Return value of UnorderedMap::insert().
///
/// Inserting an element into an UnorderedMap is not guaranteed to
/// succeed. There are three possible conditions, reported by the
/// query methods of this class:
/// <ol>
/// <li> <tt>failed()</tt>: The insert failed. This usually
/// means that the UnorderedMap ran out of space. </li>
/// <li> <tt>success()</tt>: The insert succeeded, and the key
/// did <i>not</i> exist in the table before. </li>
/// <li> <tt>existing()</tt>: The insert succeeded, and the key
/// <i>did</i> exist in the table before. The new value was
/// ignored and the old value was left in place. </li>
/// </ol>
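///
/// A sketch of checking the result inside a parallel kernel (map, key, and
/// value are assumed to exist in the caller):
/// \code
/// Kokkos::UnorderedMapInsertResult r = map.insert (key, value);
/// if      (r.success ())  { /* new key inserted at r.index() */ }
/// else if (r.existing ()) { /* key already present at r.index() */ }
/// else if (r.failed ())   { /* map ran out of capacity */ }
/// \endcode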
class UnorderedMapInsertResult
{
private:
enum Status{
SUCCESS = 1u << 31
, EXISTING = 1u << 30
, FREED_EXISTING = 1u << 29
, LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)
};
public:
/// Did the map successfully insert the key/value pair
KOKKOS_FORCEINLINE_FUNCTION
bool success() const { return (m_status & SUCCESS); }
/// Was the key already present in the map
KOKKOS_FORCEINLINE_FUNCTION
bool existing() const { return (m_status & EXISTING); }
/// Did the map fail to insert the key due to insufficient capacity
KOKKOS_FORCEINLINE_FUNCTION
bool failed() const { return m_index == UnorderedMapInvalidIndex; }
/// Did the map lose a race condition to insert a duplicate key/value pair
/// where an index was claimed that needed to be released
KOKKOS_FORCEINLINE_FUNCTION
bool freed_existing() const { return (m_status & FREED_EXISTING); }
/// How many iterations through the insert loop did it take before the
/// map returned
KOKKOS_FORCEINLINE_FUNCTION
uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }
/// Index where the key can be found as long as the insert did not fail
KOKKOS_FORCEINLINE_FUNCTION
uint32_t index() const { return m_index; }
KOKKOS_FORCEINLINE_FUNCTION
UnorderedMapInsertResult()
: m_index(UnorderedMapInvalidIndex)
, m_status(0)
{}
KOKKOS_FORCEINLINE_FUNCTION
void increment_list_position()
{
m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
}
KOKKOS_FORCEINLINE_FUNCTION
void set_existing(uint32_t i, bool arg_freed_existing)
{
m_index = i;
m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();
}
KOKKOS_FORCEINLINE_FUNCTION
void set_success(uint32_t i)
{
m_index = i;
m_status = SUCCESS | list_position();
}
private:
uint32_t m_index;
uint32_t m_status;
};
/// \class UnorderedMap
/// \brief Thread-safe, performance-portable lookup table.
///
/// This class provides a lookup table. In terms of functionality,
/// this class compares to std::unordered_map (new in C++11).
/// "Unordered" means that keys are not stored in any particular
/// order, unlike (for example) std::map. "Thread-safe" means that
/// lookups, insertion, and deletion are safe to call by multiple
/// threads in parallel. "Performance-portable" means that parallel
/// performance of these operations is reasonable, on multiple
/// hardware platforms. Platforms on which performance has been
/// tested include conventional Intel x86 multicore processors, Intel
/// Xeon Phi ("MIC"), and NVIDIA GPUs.
///
/// Parallel performance portability entails design decisions that
/// might differ from one's expectation for a sequential interface.
/// This particularly affects insertion of single elements. In an
/// interface intended for sequential use, insertion might reallocate
/// memory if the original allocation did not suffice to hold the new
/// element. In this class, insertion does <i>not</i> reallocate
/// memory. This means that it might fail. insert() returns an
/// UnorderedMapInsertResult which indicates whether the insert
/// failed. There are three possible conditions:
/// <ol>
/// <li> <tt>failed()</tt>: The insert failed. This usually
/// means that the UnorderedMap ran out of space. </li>
/// <li> <tt>success()</tt>: The insert succeeded, and the key
/// did <i>not</i> exist in the table before. </li>
/// <li> <tt>existing()</tt>: The insert succeeded, and the key
/// <i>did</i> exist in the table before. The new value was
/// ignored and the old value was left in place. </li>
/// </ol>
///
/// \tparam Key Type of keys of the lookup table. If \c const, users
/// are not allowed to add or remove keys, though they are allowed
/// to change values. In that case, the implementation may make
/// optimizations specific to the <tt>Device</tt>. For example, if
/// <tt>Device</tt> is \c Cuda, it may use texture fetches to access
/// keys.
///
/// \tparam Value Type of values stored in the lookup table. You may use
/// \c void here, in which case the table will be a set of keys. If
/// \c const, users are not allowed to change entries.
/// In that case, the implementation may make
/// optimizations specific to the \c Device, such as using texture
/// fetches to access values.
///
/// \tparam Device The Kokkos Device type.
///
/// \tparam Hasher Definition of the hash function for instances of
/// <tt>Key</tt>. The default will calculate a bitwise hash.
///
/// \tparam EqualTo Definition of the equality function for instances of
/// <tt>Key</tt>. The default will do a bitwise equality comparison.
///
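/// A host-side usage sketch (the OpenMP device, the capacity estimate, and the
/// insertion step are illustrative; insert() itself runs inside a parallel kernel):
/// \code
/// typedef Kokkos::UnorderedMap<int, double, Kokkos::OpenMP> map_type;
/// map_type map (10000);
/// // ... call map.insert (i, 2.0 * i) from a parallel_for functor ...
/// if (map.failed_insert ()) {
///   map.rehash (2 * map.capacity ());   // grow, then retry the inserts
/// }
/// \endcode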
template < typename Key
, typename Value
, typename Device
, typename Hasher = pod_hash<typename Impl::remove_const<Key>::type>
, typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type>
>
class UnorderedMap
{
public:
//! \name Public types and constants
//@{
//key_types
typedef Key declared_key_type;
typedef typename Impl::remove_const<declared_key_type>::type key_type;
typedef typename Impl::add_const<key_type>::type const_key_type;
//value_types
typedef Value declared_value_type;
typedef typename Impl::remove_const<declared_value_type>::type value_type;
typedef typename Impl::add_const<value_type>::type const_value_type;
typedef Device device_type;
typedef Hasher hasher_type;
typedef EqualTo equal_to_type;
typedef uint32_t size_type;
//map_types
typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type> insertable_map_type;
typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type> const_map_type;
static const bool is_set = Impl::is_same<void,value_type>::value;
static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value;
static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value;
static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
static const bool is_modifiable_map = has_const_key && !has_const_value;
static const bool is_const_map = has_const_key && has_const_value;
typedef UnorderedMapInsertResult insert_result;
typedef typename Device::host_mirror_device_type host_mirror_device_type;
typedef UnorderedMap<Key,Value,host_mirror_device_type,Hasher,EqualTo> HostMirror;
typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
//@}
private:
enum { invalid_index = ~static_cast<size_type>(0) };
typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
typedef typename Impl::if_c< is_insertable_map
, View< key_type *, device_type>
, View< const key_type *, device_type, MemoryTraits<RandomAccess> >
>::type key_type_view;
typedef typename Impl::if_c< is_insertable_map || is_modifiable_map
, View< impl_value_type *, device_type>
, View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
>::type value_type_view;
typedef typename Impl::if_c< is_insertable_map
, View< size_type *, device_type>
, View< const size_type *, device_type, MemoryTraits<RandomAccess> >
>::type size_type_view;
typedef typename Impl::if_c< is_insertable_map
, Bitset< device_type >
, ConstBitset< device_type>
>::type bitset_type;
enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
enum { num_scalars = 3 };
typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
public:
//! \name Public member functions
//@{
UnorderedMap()
: m_bounded_insert()
, m_hasher()
, m_equal_to()
, m_size()
, m_available_indexes()
, m_hash_lists()
, m_next_index()
, m_keys()
, m_values()
, m_scalars()
{}
/// \brief Constructor
///
/// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map
/// \param hash [in] Hasher function for \c Key instances. The
/// default value usually suffices.
UnorderedMap( size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() )
: m_bounded_insert(true)
, m_hasher(hasher)
, m_equal_to(equal_to)
, m_size()
, m_available_indexes(calculate_capacity(capacity_hint))
, m_hash_lists(AllocateWithoutInitializing(), "UnorderedMap hash list", Impl::find_hash_size(capacity()))
, m_next_index(AllocateWithoutInitializing(), "UnorderedMap next index", capacity()+1) // +1 so that the *_at functions can always return a valid reference
, m_keys("UnorderedMap keys",capacity()+1)
, m_values("UnorderedMap values",(is_set? 1 : capacity()+1))
, m_scalars("UnorderedMap scalars")
{
if (!is_insertable_map) {
throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map");
}
Kokkos::deep_copy(m_hash_lists, invalid_index);
Kokkos::deep_copy(m_next_index, invalid_index);
}
void reset_failed_insert_flag()
{
reset_flag(failed_insert_idx);
}
histogram_type get_histogram()
{
return histogram_type(*this);
}
//! Clear all entries in the table.
void clear()
{
m_bounded_insert = true;
if (capacity() == 0) return;
m_available_indexes.clear();
Kokkos::deep_copy(m_hash_lists, invalid_index);
Kokkos::deep_copy(m_next_index, invalid_index);
{
const key_type tmp = key_type();
Kokkos::deep_copy(m_keys,tmp);
}
if (is_set){
const impl_value_type tmp = impl_value_type();
Kokkos::deep_copy(m_values,tmp);
}
{
Kokkos::deep_copy(m_scalars, 0);
}
}
/// \brief Change the capacity of the map
///
/// If there are no failed inserts the current size of the map will
/// be used as a lower bound for the input capacity.
/// If the map is not empty and does not have failed inserts
/// and the capacity changes then the current data is copied
/// into the resized / rehashed map.
///
/// This is <i>not</i> a device function; it may <i>not</i> be
/// called in a parallel kernel.
bool rehash(size_type requested_capacity = 0)
{
const bool bounded_insert = (capacity() == 0) || (size() == 0u);
return rehash(requested_capacity, bounded_insert );
}
bool rehash(size_type requested_capacity, bool bounded_insert)
{
if(!is_insertable_map) return false;
const size_type curr_size = size();
requested_capacity = (requested_capacity < curr_size) ? curr_size : requested_capacity;
insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to);
if (curr_size) {
tmp.m_bounded_insert = false;
Impl::UnorderedMapRehash<insertable_map_type> f(tmp,*this);
f.apply();
}
tmp.m_bounded_insert = bounded_insert;
*this = tmp;
return true;
}
/// \brief The number of entries in the table.
///
/// This method has undefined behavior when erasable() is true.
///
/// Note that this is not a device function; it cannot be called in
/// a parallel kernel. The value is not stored as a variable; it
/// must be computed.
size_type size() const
{
if( capacity() == 0u ) return 0u;
if (modified()) {
m_size = m_available_indexes.count();
reset_flag(modified_idx);
}
return m_size;
}
/// \brief Whether at least one insert() call has failed.
///
/// This is <i>not</i> a device function; it may <i>not</i> be
/// called in a parallel kernel. The value is not stored as a
/// variable; it must be computed.
bool failed_insert() const
{
return get_flag(failed_insert_idx);
}
bool erasable() const
{
return is_insertable_map ? get_flag(erasable_idx) : false;
}
bool begin_erase()
{
bool result = !erasable();
if (is_insertable_map && result) {
device_type::fence();
set_flag(erasable_idx);
device_type::fence();
}
return result;
}
bool end_erase()
{
bool result = erasable();
if (is_insertable_map && result) {
device_type::fence();
Impl::UnorderedMapErase<declared_map_type> f(*this);
f.apply();
device_type::fence();
reset_flag(erasable_idx);
}
return result;
}
/// \brief The maximum number of entries that the table can hold.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
KOKKOS_FORCEINLINE_FUNCTION
size_type capacity() const
{ return m_available_indexes.size(); }
/// \brief The number of hash table "buckets."
///
/// This is different than the number of entries that the table can
/// hold. Each key hashes to an index in [0, hash_capacity() - 1].
/// That index can hold zero or more entries. This class decides
/// what hash_capacity() should be, given the user's upper bound on
/// the number of entries the table must be able to hold.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
KOKKOS_INLINE_FUNCTION
size_type hash_capacity() const
{ return m_hash_lists.size(); }
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/// \brief Attempt to insert the given key/value pair.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel. As discussed in the class documentation, it need not
/// succeed. The return value tells you if it did.
///
/// \param k [in] The key to attempt to insert.
/// \param v [in] The corresponding value to attempt to insert. If
/// using this class as a set (with Value = void), then you need not
/// provide this value.
KOKKOS_INLINE_FUNCTION
insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const
{
insert_result result;
if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) {
return result;
}
if ( !m_scalars((int)modified_idx) ) {
m_scalars((int)modified_idx) = true;
}
int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ;
const size_type hash_value = m_hasher(k);
const size_type hash_list = hash_value % m_hash_lists.size();
size_type * curr_ptr = & m_hash_lists[ hash_list ];
size_type new_index = invalid_index ;
// Compute the scaled index hint in floating point to avoid integer overflow
size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.size());
size_type find_attempts = 0;
enum { bounded_find_attempts = 32u };
const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
bounded_find_attempts :
m_available_indexes.max_hint();
bool not_done = true ;
#if defined( __MIC__ )
#pragma noprefetch
#endif
while ( not_done ) {
// Continue searching the unordered list for this key,
// list will only be appended during insert phase.
// Need volatile_load as other threads may be appending.
size_type curr = volatile_load(curr_ptr);
KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
#if defined( __MIC__ )
#pragma noprefetch
#endif
while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) {
result.increment_list_position();
index_hint = curr;
curr_ptr = &m_next_index[curr];
curr = volatile_load(curr_ptr);
KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
}
//------------------------------------------------------------
// If key already present then return that index.
if ( curr != invalid_index ) {
const bool free_existing = new_index != invalid_index;
if ( free_existing ) {
// Previously claimed an unused entry that was not inserted.
// Release this unused entry immediately.
if (!m_available_indexes.reset(new_index) ) {
printf("Unable to free existing\n");
}
}
result.set_existing(curr, free_existing);
not_done = false ;
}
//------------------------------------------------------------
// Key is not currently in the map.
// If the thread has claimed an entry try to insert now.
else {
//------------------------------------------------------------
// If have not already claimed an unused entry then do so now.
if (new_index == invalid_index) {
bool found = false;
// use the hash_list as the flag for the search direction
Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list );
// found an index and this thread set it
if ( !found && ++find_attempts >= max_attempts ) {
failed_insert_ref = true;
not_done = false ;
}
else if (m_available_indexes.set(index_hint) ) {
new_index = index_hint;
// Set key and value
KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]);
m_keys[new_index] = k ;
if (!is_set) {
KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]);
m_values[new_index] = v ;
}
// Do not proceed until key and value are updated in global memory
memory_fence();
}
}
else if (failed_insert_ref) {
not_done = false;
}
// Attempt to append claimed entry into the list.
// Another thread may also be trying to append the same list so protect with atomic.
if ( new_index != invalid_index &&
curr == atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) {
// Succeeded in appending
result.set_success(new_index);
not_done = false ;
}
}
} // while ( not_done )
return result ;
}
KOKKOS_INLINE_FUNCTION
bool erase(key_type const& k) const
{
bool result = false;
if(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) {
if ( ! m_scalars((int)modified_idx) ) {
m_scalars((int)modified_idx) = true;
}
size_type index = find(k);
if (valid_at(index)) {
m_available_indexes.reset(index);
result = true;
}
}
return result;
}
/// \brief Find the given key \c k, if it exists in the table.
///
/// \return If the key exists in the table, the index of the
/// value corresponding to that key; otherwise, an invalid index.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
KOKKOS_INLINE_FUNCTION
size_type find( const key_type & k) const
{
size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.size() ) : invalid_index ;
KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) {
KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
curr = m_next_index[curr];
}
return curr;
}
/// \brief Does the key exist in the map
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
KOKKOS_INLINE_FUNCTION
bool exists( const key_type & k) const
{
return valid_at(find(k));
}
/// \brief Get the value with \c i as its direct index.
///
/// \param i [in] Index directly into the array of entries.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
///
/// 'const value_type' via Cuda texture fetch must return by value.
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type
value_at(size_type i) const
{
return m_values[ is_set ? 0 : (i < capacity() ? i : capacity()) ];
}
/// \brief Get the key with \c i as its direct index.
///
/// \param i [in] Index directly into the array of entries.
///
/// This <i>is</i> a device function; it may be called in a parallel
/// kernel.
KOKKOS_FORCEINLINE_FUNCTION
key_type key_at(size_type i) const
{
return m_keys[ i < capacity() ? i : capacity() ];
}
KOKKOS_FORCEINLINE_FUNCTION
bool valid_at(size_type i) const
{
return m_available_indexes.test(i);
}
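// Lookup sketch (device-callable; key is assumed to have been inserted earlier):
//   size_type idx = map.find (key);
//   if (map.valid_at (idx)) {
//     // use map.key_at (idx) and, for non-set maps, map.value_at (idx)
//   }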
template <typename SKey, typename SValue>
UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src,
typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0
)
: m_bounded_insert(src.m_bounded_insert)
, m_hasher(src.m_hasher)
, m_equal_to(src.m_equal_to)
, m_size(src.m_size)
, m_available_indexes(src.m_available_indexes)
, m_hash_lists(src.m_hash_lists)
, m_next_index(src.m_next_index)
, m_keys(src.m_keys)
, m_values(src.m_values)
, m_scalars(src.m_scalars)
{}
template <typename SKey, typename SValue>
typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value
,declared_map_type & >::type
operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src)
{
m_bounded_insert = src.m_bounded_insert;
m_hasher = src.m_hasher;
m_equal_to = src.m_equal_to;
m_size = src.m_size;
m_available_indexes = src.m_available_indexes;
m_hash_lists = src.m_hash_lists;
m_next_index = src.m_next_index;
m_keys = src.m_keys;
m_values = src.m_values;
m_scalars = src.m_scalars;
return *this;
}
template <typename SKey, typename SValue, typename SDevice>
typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
>::type
create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
{
if (m_hash_lists.ptr_on_device() != src.m_hash_lists.ptr_on_device()) {
insertable_map_type tmp;
tmp.m_bounded_insert = src.m_bounded_insert;
tmp.m_hasher = src.m_hasher;
tmp.m_equal_to = src.m_equal_to;
tmp.m_size = src.size();
tmp.m_available_indexes = bitset_type( src.capacity() );
tmp.m_hash_lists = size_type_view( AllocateWithoutInitializing(), "UnorderedMap hash list", src.m_hash_lists.size() );
tmp.m_next_index = size_type_view( AllocateWithoutInitializing(), "UnorderedMap next index", src.m_next_index.size() );
tmp.m_keys = key_type_view( AllocateWithoutInitializing(), "UnorderedMap keys", src.m_keys.size() );
tmp.m_values = value_type_view( AllocateWithoutInitializing(), "UnorderedMap values", src.m_values.size() );
tmp.m_scalars = scalars_view("UnorderedMap scalars");
Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.size());
raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.size());
raw_deep_copy(tmp.m_keys.ptr_on_device(), src.m_keys.ptr_on_device(), sizeof(key_type)*src.m_keys.size());
if (!is_set) {
raw_deep_copy(tmp.m_values.ptr_on_device(), src.m_values.ptr_on_device(), sizeof(impl_value_type)*src.m_values.size());
}
raw_deep_copy(tmp.m_scalars.ptr_on_device(), src.m_scalars.ptr_on_device(), sizeof(int)*num_scalars );
*this = tmp;
}
}
//@}
private: // private member functions
bool modified() const
{
return get_flag(modified_idx);
}
void set_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int true_ = true;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
}
void reset_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int false_ = false;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
}
bool get_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
int result = false;
raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
return result;
}
static uint32_t calculate_capacity(uint32_t capacity_hint)
{
// increase by ~16% and round up to the nearest multiple of 128
return capacity_hint ? ((static_cast<uint32_t>(7ull*capacity_hint/6u) + 127u)/128u)*128u : 128u;
}
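// Worked example: capacity_hint = 1000 -> 7*1000/6 = 1166 -> rounded up to the
// next multiple of 128 gives 1280.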
private: // private members
bool m_bounded_insert;
hasher_type m_hasher;
equal_to_type m_equal_to;
mutable size_type m_size;
bitset_type m_available_indexes;
size_type_view m_hash_lists;
size_type_view m_next_index;
key_type_view m_keys;
value_type_view m_values;
scalars_view m_scalars;
template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo>
friend class UnorderedMap;
template <typename UMap>
friend struct Impl::UnorderedMapErase;
template <typename UMap>
friend struct Impl::UnorderedMapHistogram;
template <typename UMap>
friend struct Impl::UnorderedMapPrint;
};
// Specialization of deep_copy for two UnorderedMap objects.
template < typename DKey, typename DT, typename DDevice
, typename SKey, typename ST, typename SDevice
, typename Hasher, typename EqualTo >
inline void deep_copy( UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst
, const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src )
{
dst.create_copy_view(src);
}
} // namespace Kokkos
#endif //KOKKOS_UNORDERED_MAP_HPP
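A hedged usage sketch (not part of the library sources): because the deep_copy specialization above simply forwards to create_copy_view, mirroring a device-resident map onto a second map is a one-liner. The device names, key/value types, and the capacity-hint constructor below are illustrative assumptions.
// illustrative only -- DeviceType and HostDeviceType stand in for two
// initialized Kokkos devices; the capacity-hint constructor is assumed
typedef Kokkos::UnorderedMap<int, double, DeviceType>     device_map_type;
typedef Kokkos::UnorderedMap<int, double, HostDeviceType> host_map_type;
device_map_type dev_map(1000);          // capacity hint, rounded up internally
// ... fill dev_map from device kernels via dev_map.insert(key, value) ...
host_map_type host_map;                 // create_copy_view reallocates to match dev_map
Kokkos::deep_copy(host_map, dev_map);   // dispatches to host_map.create_copy_view(dev_map)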

View File

@ -0,0 +1,282 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_VECTOR_HPP
#define KOKKOS_VECTOR_HPP
#include <KokkosCore_config.h>
#include <Kokkos_DualView.hpp>
/* Drop-in replacement for std::vector based on Kokkos::DualView
 * Most functions only work on the host (they will not compile if called from a device kernel)
*
*/
namespace Kokkos {
template <typename Scalar, class Device=Impl::DefaultDeviceType>
class vector : public DualView<Scalar*,LayoutLeft,Device> {
public:
typedef Device device_type;
typedef Scalar value_type;
typedef Scalar* pointer;
typedef const Scalar* const_pointer;
typedef Scalar* reference;
typedef const Scalar* const_reference;
typedef Scalar* iterator;
typedef const Scalar* const_iterator;
private:
size_t _size;
typedef size_t size_type;
float _extra_storage;
typedef DualView<Scalar*,LayoutLeft,Device> DV;
public:
inline Scalar& operator() (int i) const {return DV::h_view(i);};
inline Scalar& operator[] (int i) const {return DV::h_view(i);};
/* Member functions which behave like std::vector functions */
vector():DV() {
_size = 0;
_extra_storage = 1.1;
DV::modified_host = 1;
};
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Device>("Vector",size_t(n*(1.1))) {
_size = n;
_extra_storage = 1.1;
DV::modified_host = 1;
assign(n,val);
}
void resize(size_t n) {
if(n>=capacity())
DV::resize(size_t (n*_extra_storage));
_size = n;
}
void resize(size_t n, const Scalar& val) {
assign(n,val);
}
void assign (size_t n, const Scalar& val) {
/* Resize if necessary (behaviour of std::vector) */
if(n>capacity())
DV::resize(size_t (n*_extra_storage));
_size = n;
/* Assign value either on host or on device */
if( DV::modified_host >= DV::modified_device ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::device_type::fence();
DV::modified_host++;
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::device_type::fence();
DV::modified_device++;
}
}
void reserve(size_t n) {
DV::resize(size_t (n*_extra_storage));
}
void push_back(Scalar val) {
DV::modified_host++;
if(_size == capacity()) {
size_t new_size = _size*_extra_storage;
if(new_size == _size) new_size++;
DV::resize(new_size);
}
DV::h_view(_size) = val;
_size++;
};
void pop_back() {
_size--;
};
void clear() {
_size = 0;
}
size_type size() const {return _size;};
size_type max_size() const {return 2000000000;}
size_type capacity() const {return DV::capacity();};
bool empty() const {return _size==0;};
iterator begin() const {return &DV::h_view(0);};
iterator end() const {return &DV::h_view(_size);};
/* std::algorithms which normally work with iterators are implemented here as member functions */
size_t
lower_bound (const size_t& start,
const size_t& theEnd,
const Scalar& comp_val) const
{
int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion
int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion
if (upper <= lower) {
return theEnd;
}
Scalar lower_val = DV::h_view(lower);
Scalar upper_val = DV::h_view(upper);
size_t idx = (upper+lower)/2;
Scalar val = DV::h_view(idx);
if(val>upper_val) return upper;
if(val<lower_val) return start;
while(upper>lower) {
if(comp_val>val) {
lower = ++idx;
} else {
upper = idx;
}
idx = (upper+lower)/2;
val = DV::h_view(idx);
}
return idx;
}
bool is_sorted() {
for(size_t i=0; i+1<_size; i++) {
if(DV::h_view(i)>DV::h_view(i+1)) return false;
}
return true;
}
iterator find(Scalar val) const {
if(_size == 0) return end();
int upper,lower,current;
current = _size/2;
upper = _size-1;
lower = 0;
if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end();
while(upper>lower)
{
if(val>DV::h_view(current)) lower = current+1;
else upper = current;
current = (upper+lower)/2;
}
if(val==DV::h_view(current)) return &DV::h_view(current);
else return end();
}
/* Additional functions for data management */
void device_to_host(){
deep_copy(DV::h_view,DV::d_view);
}
void host_to_device() const {
deep_copy(DV::d_view,DV::h_view);
}
void on_host() {
DV::modified_host = DV::modified_device + 1;
}
void on_device() {
DV::modified_device = DV::modified_host + 1;
}
void set_overallocation(float extra) {
_extra_storage = 1.0 + extra;
}
public:
struct set_functor {
typedef typename DV::t_dev::device_type device_type;
typename DV::t_dev _data;
Scalar _val;
set_functor(typename DV::t_dev data, Scalar val) :
_data(data),_val(val) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int &i) const {
_data(i) = _val;
}
};
struct set_functor_host {
typedef typename DV::t_host::device_type device_type;
typename DV::t_host _data;
Scalar _val;
set_functor_host(typename DV::t_host data, Scalar val) :
_data(data),_val(val) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int &i) const {
_data(i) = _val;
}
};
};
}
#endif
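A hedged usage sketch of the class above, assuming the header is included and the default device has been initialized: push_back and operator[] act on the host view, and the explicit transfer calls move the data across the underlying DualView.
// minimal sketch; d_view/h_view are the DualView members inherited by Kokkos::vector
Kokkos::vector<double> v;            // empty, grows with 10% over-allocation
for (int i = 0; i < 100; ++i)
  v.push_back(1.0 * i);              // host-side append; marks the host view modified
v.host_to_device();                  // copy h_view into d_view before device kernels use it
// ... run device kernels that read v.d_view ...
v.device_to_host();                  // copy results back before reading v[i] on the host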

View File

@ -0,0 +1,173 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BITSET_IMPL_HPP
#define KOKKOS_BITSET_IMPL_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <cstdio>
#include <climits>
#include <iostream>
#include <iomanip>
namespace Kokkos { namespace Impl {
KOKKOS_FORCEINLINE_FUNCTION
unsigned rotate_right(unsigned i, int r)
{
enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
return r ? ((i >> r) | (i << (size-r))) : i ;
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_forward(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
return _bit_scan_forward(i);
#else
unsigned t = 1u;
int r = 0;
while (i && ((i & t) == 0))
{
t = t << 1;
++r;
}
return r;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_reverse(unsigned i)
{
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
while (i && ((i & t) == 0))
{
t = t >> 1;
++r;
}
return r;
#endif
}
// count the bits set
KOKKOS_FORCEINLINE_FUNCTION
int popcount(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ((i >> 1) & ~0u/3u); // temp
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
#endif
}
template <typename Bitset>
struct BitsetCount
{
typedef Bitset bitset_type;
typedef typename bitset_type::device_type device_type;
typedef typename bitset_type::size_type size_type;
typedef size_type value_type;
bitset_type m_bitset;
BitsetCount( bitset_type const& bitset)
: m_bitset(bitset)
{}
size_type apply() const
{
size_type count = 0u;
parallel_reduce(m_bitset.m_blocks.size(), *this, count);
return count;
}
KOKKOS_INLINE_FUNCTION
static void init( value_type & count)
{
count = 0u;
}
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & count, const volatile size_type & incr )
{
count += incr;
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i, value_type & count) const
{
count += popcount(m_bitset.m_blocks[i]);
}
};
}} //Kokkos::Impl
#endif // KOKKOS_BITSET_IMPL_HPP
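For orientation, a hedged sketch of what these helpers return; the values follow directly from the definitions above and hold for both the compiler intrinsics and the portable fallbacks.
unsigned x = 0x00900000u;                       // bits 20 and 23 set
int lo = Kokkos::Impl::bit_scan_forward(x);     // 20 : index of the lowest set bit
int hi = Kokkos::Impl::bit_scan_reverse(x);     // 23 : index of the highest set bit
int n  = Kokkos::Impl::popcount(x);             //  2 : number of set bits
unsigned r = Kokkos::Impl::rotate_right(x, 4);  // 0x00090000u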

View File

@ -0,0 +1,154 @@
#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP
#define KOKKOS_FUNCTIONAL_IMPL_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
namespace Kokkos { namespace Impl {
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
KOKKOS_FORCEINLINE_FUNCTION
uint32_t getblock32 ( const uint8_t * p, int i )
{
// load the block one byte at a time to avoid aliasing violations, which
// could cause errors with forced inlining
return ((uint32_t)p[i*4+0])
| ((uint32_t)p[i*4+1] << 8)
| ((uint32_t)p[i*4+2] << 16)
| ((uint32_t)p[i*4+3] << 24);
}
KOKKOS_FORCEINLINE_FUNCTION
uint32_t rotl32 ( uint32_t x, int8_t r )
{ return (x << r) | (x >> (32 - r)); }
KOKKOS_FORCEINLINE_FUNCTION
uint32_t fmix32 ( uint32_t h )
{
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
KOKKOS_INLINE_FUNCTION
uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed )
{
const uint8_t * data = (const uint8_t*)key;
const int nblocks = len / 4;
uint32_t h1 = seed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
//----------
// body
for(int i=0; i<nblocks; ++i)
{
uint32_t k1 = getblock32(data,i);
k1 *= c1;
k1 = rotl32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = rotl32(h1,13);
h1 = h1*5+0xe6546b64;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
uint32_t k1 = 0;
switch(len & 3)
{
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1: k1 ^= tail[0];
k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= len;
h1 = fmix32(h1);
return h1;
}
#if defined( __GNUC__ ) /* GNU C */ || \
defined( __GNUG__ ) /* GNU C++ */ || \
defined( __clang__ )
#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
#else
#define KOKKOS_MAY_ALIAS
#endif
template <typename T>
KOKKOS_FORCEINLINE_FUNCTION
bool bitwise_equal(T const * const a_ptr, T const * const b_ptr)
{
typedef uint64_t KOKKOS_MAY_ALIAS T64;
typedef uint32_t KOKKOS_MAY_ALIAS T32;
typedef uint16_t KOKKOS_MAY_ALIAS T16;
typedef uint8_t KOKKOS_MAY_ALIAS T8;
enum {
NUM_8 = sizeof(T),
NUM_16 = NUM_8 / 2,
NUM_32 = NUM_8 / 4,
NUM_64 = NUM_8 / 8
};
union {
T const * const ptr;
T64 const * const ptr64;
T32 const * const ptr32;
T16 const * const ptr16;
T8 const * const ptr8;
} a = {a_ptr}, b = {b_ptr};
bool result = true;
for (int i=0; i < NUM_64; ++i) {
result = result && a.ptr64[i] == b.ptr64[i];
}
if ( NUM_64*2 < NUM_32 ) {
result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2];
}
if ( NUM_32*2 < NUM_16 ) {
result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2];
}
if ( NUM_16*2 < NUM_8 ) {
result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2];
}
return result;
}
#undef KOKKOS_MAY_ALIAS
}} // namespace Kokkos::Impl
#endif //KOKKOS_FUNCTIONAL_IMPL_HPP
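A hedged sketch of how the hash is typically invoked on a POD key (the key type and seed are assumptions; this mirrors what a byte-wise hasher would do with the function above):
int key = 42;
uint32_t seed = 0u;
uint32_t h = Kokkos::Impl::MurmurHash3_x86_32(&key, sizeof(key), seed);
// h is a 32-bit hash of the raw bytes of 'key'; identical byte patterns hash identically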

View File

@ -0,0 +1,223 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
inline
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
{
return view ;
}
template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
inline
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view )
{
// Force copy:
//typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type ;
typename staticcrsgraph_type::HostMirror tmp ;
typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map);
// Allocation to match:
tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const'
tmp.entries = create_mirror( view.entries );
// Deep copy:
deep_copy( tmp_row_map , view.row_map );
deep_copy( tmp.entries , view.entries );
return tmp ;
}
template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
inline
typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
{
return create_mirror( view );
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class StaticCrsGraphType , class InputSizeType >
inline
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
const std::vector< InputSizeType > & input )
{
typedef StaticCrsGraphType output_type ;
//typedef std::vector< InputSizeType > input_type ; // unused
typedef typename output_type::entries_type entries_type ;
typedef View< typename output_type::size_type [] ,
typename output_type::array_layout ,
typename output_type::device_type > work_type ;
output_type output ;
// Create the row map:
const size_t length = input.size();
{
work_type row_work( "tmp" , length + 1 );
typename work_type::HostMirror row_work_host =
create_mirror_view( row_work );
size_t sum = 0 ;
row_work_host[0] = 0 ;
for ( size_t i = 0 ; i < length ; ++i ) {
row_work_host[i+1] = sum += input[i];
}
deep_copy( row_work , row_work_host );
output.entries = entries_type( label , sum );
output.row_map = row_work ;
}
return output ;
}
//----------------------------------------------------------------------------
template< class StaticCrsGraphType , class InputSizeType >
inline
typename StaticCrsGraphType::staticcrsgraph_type
create_staticcrsgraph( const std::string & label ,
const std::vector< std::vector< InputSizeType > > & input )
{
typedef StaticCrsGraphType output_type ;
//typedef std::vector< std::vector< InputSizeType > > input_type ; // unused
typedef typename output_type::entries_type entries_type ;
//typedef typename output_type::size_type size_type ; // unused
// mfh 14 Feb 2014: This function doesn't actually create instances
// of ok_rank, but it needs to declare the typedef in order to do
// the static "assert" (a compile-time check that the given shape
// has rank 1). In order to avoid a "declared but unused typedef"
// warning, we declare an empty instance of this type, with the
// usual "(void)" marker to avoid a compiler warning for the unused
// variable.
typedef typename
Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type
ok_rank ;
{
ok_rank thing;
(void) thing;
}
typedef View< typename output_type::size_type [] ,
typename output_type::array_layout ,
typename output_type::device_type > work_type ;
output_type output ;
// Create the row map:
const size_t length = input.size();
{
work_type row_work( "tmp" , length + 1 );
typename work_type::HostMirror row_work_host =
create_mirror_view( row_work );
size_t sum = 0 ;
row_work_host[0] = 0 ;
for ( size_t i = 0 ; i < length ; ++i ) {
row_work_host[i+1] = sum += input[i].size();
}
deep_copy( row_work , row_work_host );
output.entries = entries_type( label , sum );
output.row_map = row_work ;
}
// Fill in the entries:
{
typename entries_type::HostMirror host_entries =
create_mirror_view( output.entries );
size_t sum = 0 ;
for ( size_t i = 0 ; i < length ; ++i ) {
for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) {
host_entries( sum ) = input[i][j] ;
}
}
deep_copy( output.entries , host_entries );
}
return output ;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP */
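A hedged sketch of the row-size overload above (the graph typedef and device are assumptions): given per-row entry counts, create_staticcrsgraph builds the prefix-summed row map on the device and allocates an entries view of the total length, which the caller then fills; the vector-of-vectors overload additionally populates the entries.
// illustrative only -- graph_type stands in for a StaticCrsGraph specialization
// on an initialized device
std::vector<size_t> row_sizes(3);
row_sizes[0] = 2; row_sizes[1] = 0; row_sizes[2] = 3;
graph_type g = Kokkos::create_staticcrsgraph<graph_type>("my_graph", row_sizes);
// g.row_map now holds the prefix sums {0, 2, 2, 5}; g.entries has length 5 and
// is left for the caller to fill (e.g. via create_mirror_view and deep_copy)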

View File

@ -0,0 +1,101 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_UnorderedMap.hpp>
namespace Kokkos { namespace Impl {
uint32_t find_hash_size(uint32_t size)
{
if (size == 0u) return 0u;
// these primes try to preserve the randomness of the hash
static const uint32_t primes [] = {
3, 7, 13, 23, 53, 97, 193, 389, 769, 1543
, 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539
, 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827
, 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289
, 12967 , 13649 , 14341 , 15013 , 15727
, 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329
, 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439
, 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619
, 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963
, 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579
, 201653 , 211741 , 221813 , 231893 , 241979 , 252079
, 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457
, 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609
, 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239
, 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869
, 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253
, 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739
, 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503
, 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469
, 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033
, 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729
, 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861
, 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661
, 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529
, 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327
, 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099
, 55924061 , 58161041 , 60397993 , 62634959 , 64871921
, 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427
, 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971
, 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141
, 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237
, 232644089 , 241591943 , 250539763 , 259487603 , 268435399
};
const uint32_t num_primes = sizeof(primes)/sizeof(uint32_t);
uint32_t hsize = primes[num_primes-1] ;
for (uint32_t i = 0; i < num_primes; ++i) {
if (size <= primes[i]) {
hsize = primes[i];
break;
}
}
return hsize;
}
}} // namespace Kokkos::Impl
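In other words, find_hash_size returns the smallest listed prime that is at least the requested size (or the largest listed prime when the request exceeds the table). A couple of hedged examples:
uint32_t a = Kokkos::Impl::find_hash_size(0u);     // 0    : an empty map needs no hash lists
uint32_t b = Kokkos::Impl::find_hash_size(97u);    // 97   : exact matches are returned as-is
uint32_t c = Kokkos::Impl::find_hash_size(1000u);  // 1543 : smallest listed prime >= 1000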

View File

@ -0,0 +1,297 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP
#define KOKKOS_UNORDERED_MAP_IMPL_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <cstdio>
#include <climits>
#include <iostream>
#include <iomanip>
namespace Kokkos { namespace Impl {
uint32_t find_hash_size( uint32_t size );
template <typename Map>
struct UnorderedMapRehash
{
typedef Map map_type;
typedef typename map_type::const_map_type const_map_type;
typedef typename map_type::device_type device_type;
typedef typename map_type::size_type size_type;
map_type m_dst;
const_map_type m_src;
UnorderedMapRehash( map_type const& dst, const_map_type const& src)
: m_dst(dst), m_src(src)
{}
void apply() const
{
parallel_for(m_src.capacity(), *this);
}
KOKKOS_INLINE_FUNCTION
void operator()(size_type i) const
{
if ( m_src.valid_at(i) )
m_dst.insert(m_src.key_at(i), m_src.value_at(i));
}
};
template <typename UMap>
struct UnorderedMapErase
{
typedef UMap map_type;
typedef typename map_type::device_type device_type;
typedef typename map_type::size_type size_type;
typedef typename map_type::key_type key_type;
typedef typename map_type::impl_value_type value_type;
map_type m_map;
UnorderedMapErase( map_type const& map)
: m_map(map)
{}
void apply() const
{
parallel_for(m_map.m_hash_lists.size(), *this);
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i ) const
{
const size_type invalid_index = map_type::invalid_index;
size_type curr = m_map.m_hash_lists(i);
size_type next = invalid_index;
// remove erased head of the linked-list
while (curr != invalid_index && !m_map.valid_at(curr)) {
next = m_map.m_next_index[curr];
m_map.m_next_index[curr] = invalid_index;
m_map.m_keys[curr] = key_type();
if (m_map.is_set) m_map.m_values[curr] = value_type();
curr = next;
m_map.m_hash_lists(i) = next;
}
// if the list is non-empty and the head is valid
if (curr != invalid_index && m_map.valid_at(curr) ) {
size_type prev = curr;
curr = m_map.m_next_index[prev];
while (curr != invalid_index) {
next = m_map.m_next_index[curr];
if (m_map.valid_at(curr)) {
prev = curr;
}
else {
// remove curr from list
m_map.m_next_index[prev] = next;
m_map.m_next_index[curr] = invalid_index;
m_map.m_keys[curr] = key_type();
if (map_type::is_set) m_map.m_values[curr] = value_type();
}
curr = next;
}
}
}
};
template <typename UMap>
struct UnorderedMapHistogram
{
typedef UMap map_type;
typedef typename map_type::device_type device_type;
typedef typename map_type::size_type size_type;
typedef View<int[100], device_type> histogram_view;
typedef typename histogram_view::HostMirror host_histogram_view;
map_type m_map;
histogram_view m_length;
histogram_view m_distance;
histogram_view m_block_distance;
UnorderedMapHistogram( map_type const& map)
: m_map(map)
, m_length("UnorderedMap Histogram")
, m_distance("UnorderedMap Histogram")
, m_block_distance("UnorderedMap Histogram")
{}
void calculate()
{
parallel_for(m_map.m_hash_lists.size(), *this);
}
void clear()
{
Kokkos::deep_copy(m_length, 0);
Kokkos::deep_copy(m_distance, 0);
Kokkos::deep_copy(m_block_distance, 0);
}
void print_length(std::ostream &out)
{
host_histogram_view host_copy = create_mirror_view(m_length);
Kokkos::deep_copy(host_copy, m_length);
for (int i=0, size = host_copy.size(); i<size; ++i)
{
out << host_copy[i] << " , ";
}
out << "\b\b\b " << std::endl;
}
void print_distance(std::ostream &out)
{
host_histogram_view host_copy = create_mirror_view(m_distance);
Kokkos::deep_copy(host_copy, m_distance);
for (int i=0, size = host_copy.size(); i<size; ++i)
{
out << host_copy[i] << " , ";
}
out << "\b\b\b " << std::endl;
}
void print_block_distance(std::ostream &out)
{
host_histogram_view host_copy = create_mirror_view(m_block_distance);
Kokkos::deep_copy(host_copy, m_block_distance);
for (int i=0, size = host_copy.size(); i<size; ++i)
{
out << host_copy[i] << " , ";
}
out << "\b\b\b " << std::endl;
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i ) const
{
const size_type invalid_index = map_type::invalid_index;
uint32_t length = 0;
size_type min_index = ~0u, max_index = 0;
for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; curr = m_map.m_next_index[curr]) {
++length;
min_index = (curr < min_index) ? curr : min_index;
max_index = (max_index < curr) ? curr : max_index;
}
size_type distance = (0u < length) ? max_index - min_index : 0u;
size_type blocks = (0u < length) ? max_index/32u - min_index/32u : 0u;
// normalize data
length = length < 100u ? length : 99u;
distance = distance < 100u ? distance : 99u;
blocks = blocks < 100u ? blocks : 99u;
if (0u < length)
{
atomic_fetch_add( &m_length(length), 1);
atomic_fetch_add( &m_distance(distance), 1);
atomic_fetch_add( &m_block_distance(blocks), 1);
}
}
};
template <typename UMap>
struct UnorderedMapPrint
{
typedef UMap map_type;
typedef typename map_type::device_type device_type;
typedef typename map_type::size_type size_type;
map_type m_map;
UnorderedMapPrint( map_type const& map)
: m_map(map)
{}
void apply()
{
parallel_for(m_map.m_hash_lists.size(), *this);
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i ) const
{
const size_type invalid_index = map_type::invalid_index;
uint32_t list = m_map.m_hash_lists(i);
for (size_type curr = list, ii=0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) {
printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr));
}
}
};
template <typename DKey, typename DValue, typename SKey, typename SValue>
struct UnorderedMapCanAssign : public false_ {};
template <typename Key, typename Value>
struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {};
template <typename Key, typename Value>
struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {};
template <typename Key, typename Value>
struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {};
template <typename Key, typename Value>
struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {};
}} //Kokkos::Impl
#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP
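A hedged sketch of how the histogram helper above is driven from host code (map_type and map are assumptions; printing needs <iostream>):
Kokkos::Impl::UnorderedMapHistogram<map_type> hist(map);
hist.clear();                     // zero the three 100-bin histograms
hist.calculate();                 // parallel_for over the hash lists
hist.print_length(std::cout);     // distribution of hash-list lengths, useful for capacity tuning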

View File

@ -0,0 +1,283 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <string>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
/*--------------------------------------------------------------------------*/
#if defined( __CUDACC__ )
namespace Kokkos {
namespace Impl {
class CudaExec {
public:
__device__ inline
CudaExec( const int shmem_begin , const int shmem_end )
: m_shmem_end( shmem_end )
, m_shmem_iter( shmem_begin )
{}
__device__ inline
void * get_shmem( const int size )
{
extern __shared__ int sh[];
// m_shmem_iter is in bytes, convert to integer offsets
const int offset = m_shmem_iter >> power_of_two<sizeof(int)>::value ;
m_shmem_iter += size ;
if ( m_shmem_end < m_shmem_iter ) {
cuda_abort("Cuda::get_shmem out of memory");
}
return sh + offset ;
}
private:
const int m_shmem_end ;
int m_shmem_iter ;
};
} // namespace Impl
} // namespace Kokkos
#if defined( __CUDA_ARCH__ )
namespace Kokkos {
inline __device__
void * Cuda::get_shmem( const int size ) { return m_exec.get_shmem( size ); }
} // namespace Kokkos
#endif /* defined( __CUDA_ARCH__ ) */
#endif /* defined( __CUDACC__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
};
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
__device__ __constant__
Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
kokkos_impl_cuda_constant_memory_buffer ;
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType >
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
inline
CudaParallelLaunch( const DriverType & driver ,
const dim3 & grid ,
const dim3 & block ,
const int shmem )
{
if ( grid.x && block.x ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
}
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>();
#if defined( KOKKOS_EXPRESSION_CHECK )
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
inline
CudaParallelLaunch( const DriverType & driver ,
const dim3 & grid ,
const dim3 & block ,
const int shmem )
{
if ( grid.x && block.x ) {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
}
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem >>>( driver );
#if defined( KOKKOS_EXPRESSION_CHECK )
Kokkos::Cuda::fence();
#endif
}
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
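A hedged, illustrative sketch of the launch path above, valid only in a __CUDACC__ translation unit; the functor and launch dimensions are assumptions, not Kokkos API. A driver no larger than the 512-byte ConstantMemoryUseThreshold is passed by value to cuda_parallel_launch_local_memory, while a larger one is first copied into the constant-memory buffer.
struct TinyDriver {
  __device__ void operator()() const { /* kernel body; executed by every CUDA thread */ }
};
void launch_tiny()
{
  const TinyDriver driver ;
  const dim3 grid( 64 , 1 , 1 ) , block( 128 , 1 , 1 ) ;
  // sizeof(TinyDriver) <= 512 bytes, so the CudaParallelLaunch< TinyDriver , false >
  // specialization is selected and the functor is passed by value to the kernel
  Kokkos::Impl::CudaParallelLaunch< TinyDriver >( driver , grid , block , 0 /* shmem */ );
}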

View File

@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_MemoryTracking.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
DeepCopy<HostSpace,CudaSpace>
::DeepCopy( void * dst , const void * src , size_t n )
{
CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
}
DeepCopy<CudaSpace,HostSpace>
::DeepCopy( void * dst , const void * src , size_t n )
{
CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
}
DeepCopy<CudaSpace,CudaSpace>
::DeepCopy( void * dst , const void * src , size_t n )
{
CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
class CudaMemoryTrackingEntry : public Impl::MemoryTrackingEntry
{
public:
void * const ptr_alloc ;
const unsigned size ;
const unsigned count ;
Impl::cuda_texture_object_type tex_obj ;
CudaMemoryTrackingEntry( const std::string & arg_label ,
const std::type_info & arg_info ,
void * const arg_ptr ,
const unsigned arg_size ,
const unsigned arg_count )
: Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size * arg_count )
, ptr_alloc( arg_ptr )
, size( arg_size )
, count( arg_count )
, tex_obj( 0 )
{}
~CudaMemoryTrackingEntry();
};
CudaMemoryTrackingEntry::~CudaMemoryTrackingEntry()
{
std::ostringstream oss;
bool error = false;
try {
Kokkos::Impl::cuda_device_synchronize();
}
catch(std::runtime_error & err) {
error = true;
oss << err.what() << std::endl;
}
if ( tex_obj ) {
}
try {
CUDA_SAFE_CALL( cudaFree( ptr_alloc ) );
}
catch(std::runtime_error & err) {
error = true;
oss << err.what() << std::endl;
}
if ( error ) {
std::cerr << "cudaFree( " << ptr_alloc << " ) FAILED for " ;
Impl::MemoryTrackingEntry::print( std::cerr );
std::cerr << oss.str() << std::endl;
}
}
Impl::MemoryTracking & cuda_space_singleton()
{
static Impl::MemoryTracking self("Kokkos::CudaSpace");
return self ;
}
bool cuda_space_verify_modifiable( const char * const label )
{
static const char error_in_parallel[] = "Called with HostSpace::in_parallel()" ;
static const char error_not_exists[] = "Called after return from main()" ;
const char * const error_msg =
HostSpace::in_parallel() ? error_in_parallel : (
! cuda_space_singleton().exists() ? error_not_exists : (const char *) 0 );
if ( error_msg ) {
std::cerr << "Kokkos::CudaSpace::" << label << " ERROR : " << error_msg << std::endl ;
}
return error_msg == 0 ;
}
}
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
void * CudaSpace::allocate(
const std::string & label ,
const std::type_info & scalar_type ,
const size_t scalar_size ,
const size_t scalar_count )
{
void * ptr = 0 ;
const size_t size = scalar_size * scalar_count ;
if ( cuda_space_verify_modifiable("allocate") && size ) {
try {
Kokkos::Impl::cuda_device_synchronize();
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM)
CUDA_SAFE_CALL( cudaMallocManaged( (void**) &ptr, size, cudaMemAttachGlobal) );
#else
CUDA_SAFE_CALL( cudaMalloc( (void**) &ptr, size) );
#endif
Kokkos::Impl::cuda_device_synchronize();
}
catch( std::runtime_error & err ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::CudaSpace::allocate( "
<< label
<< " , " << scalar_type.name()
<< " , " << scalar_size
<< " , " << scalar_count
<< " ) FAILED memory allocation\n"
<< err.what();
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
cuda_space_singleton().insert(
new CudaMemoryTrackingEntry( label , scalar_type , ptr , scalar_size , scalar_count ) );
}
return ptr ;
}
void CudaSpace::increment( const void * ptr )
{
if ( cuda_space_verify_modifiable("increment") ) {
cuda_space_singleton().increment( ptr );
}
}
void CudaSpace::decrement( const void * ptr )
{
if ( cuda_space_verify_modifiable("decrement") ) {
cuda_space_singleton().decrement( ptr );
}
}
void CudaSpace::print_memory_view( std::ostream & o )
{
cuda_space_singleton().print( o , std::string(" ") );
}
//----------------------------------------------------------------------------
std::string CudaSpace::query_label( const void * p )
{
const Impl::MemoryTrackingEntry * entry =
cuda_space_singleton().query( p );
return entry ? entry->label : std::string("ERROR NOT FOUND");
}
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
void CudaSpace::access_error( const void * const ptr )
{
std::ostringstream msg ;
msg << "Kokkos::CudaSpace::access_error:" ;
msg << " attempt to access Cuda-data labeled(" ;
msg << query_label( ptr ) ;
msg << ") from non-Cuda execution" ;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
namespace Kokkos {
namespace Impl {
::cudaTextureObject_t
cuda_texture_object_attach(
const cudaChannelFormatDesc & desc ,
const void * const ptr )
{
if ( 0 == ptr || ! cuda_space_verify_modifiable("texture_object_attach") ) return 0 ;
const unsigned max_count = 1 << 28 ;
CudaMemoryTrackingEntry * entry =
dynamic_cast<CudaMemoryTrackingEntry *>( cuda_space_singleton().query( ptr ) );
const bool ok_found = 0 != entry ;
const bool ok_ptr = ok_found && ptr == entry->ptr_alloc ;
const bool ok_count = ok_found && entry->count < max_count ;
if ( ok_found && ok_ptr && ok_count ) {
// Can only create texture object on device architecture 3.0 or better
if ( 0 == entry->tex_obj && 300 <= Cuda::device_arch() ) {
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = entry->size * entry->count ;
resDesc.res.linear.devPtr = entry->ptr_alloc ;
cudaCreateTextureObject( & entry->tex_obj, & resDesc, & texDesc, NULL);
}
}
else {
std::ostringstream msg ;
msg << "CudaSpace::texture_object_attach( " << ptr << " ) FAILED: " ;
if ( ! ok_found ) {
msg << "Not View allocated" ;
}
else if ( ! ok_ptr ) {
msg << "Not the originally allocated View \"" << entry->label << "\"" ;
}
else if ( ! ok_count ) {
msg << "Cuda texture object limit exceeded "
<< max_count << " <= " << entry->count ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
return entry->tex_obj ;
}
} // namespace Impl
} // namespace Kokkos
#endif

View File

@ -0,0 +1,609 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Cuda.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace {
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
if (env == 0) return false;
return atoi(env);
}
}
void cuda_device_synchronize()
{
static const bool launch_blocking = cuda_launch_blocking();
if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
{
std::ostringstream out ;
out << name << " error: " << cudaGetErrorString(e);
if (file) {
out << " " << file << ":" << line;
}
throw_runtime_exception( out.str() );
}
//----------------------------------------------------------------------------
// Some significant cuda device properties:
//
// cudaDeviceProp::name : Text label for device
// cudaDeviceProp::major : Device major number
// cudaDeviceProp::minor : Device minor number
// cudaDeviceProp::warpSize : number of threads per warp
// cudaDeviceProp::multiProcessorCount : number of multiprocessors
// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block
// cudaDeviceProp::totalConstMem : capacity of constant memory
// cudaDeviceProp::totalGlobalMem : capacity of global memory
// cudaDeviceProp::maxGridSize[3] : maximum grid size
//
// Section 4.4.2.4 of the CUDA Toolkit Reference Manual
//
// struct cudaDeviceProp {
// char name[256];
// size_t totalGlobalMem;
// size_t sharedMemPerBlock;
// int regsPerBlock;
// int warpSize;
// size_t memPitch;
// int maxThreadsPerBlock;
// int maxThreadsDim[3];
// int maxGridSize[3];
// size_t totalConstMem;
// int major;
// int minor;
// int clockRate;
// size_t textureAlignment;
// int deviceOverlap;
// int multiProcessorCount;
// int kernelExecTimeoutEnabled;
// int integrated;
// int canMapHostMemory;
// int computeMode;
// int concurrentKernels;
// int ECCEnabled;
// int pciBusID;
// int pciDeviceID;
// int tccDriver;
// int asyncEngineCount;
// int unifiedAddressing;
// int memoryClockRate;
// int memoryBusWidth;
// int l2CacheSize;
// int maxThreadsPerMultiProcessor;
// };
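// A minimal standalone sketch (not compiled here) of querying these
// properties with the CUDA runtime API, mirroring what the
// CudaInternalDevices class below caches at construction:
#if 0
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
  int count = 0 ;
  cudaGetDeviceCount( & count );
  for ( int i = 0 ; i < count ; ++i ) {
    cudaDeviceProp prop ;
    cudaGetDeviceProperties( & prop , i );
    std::printf( "device %d : %s capability %d.%d warpSize %d\n" ,
                 i , prop.name , prop.major , prop.minor , prop.warpSize );
  }
  return 0 ;
}
#endif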
namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 8 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
CudaInternalDevices();
static const CudaInternalDevices & singleton();
};
CudaInternalDevices::CudaInternalDevices()
{
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
}
const CudaInternalDevices & CudaInternalDevices::singleton()
{
static CudaInternalDevices self ; return self ;
}
}
//----------------------------------------------------------------------------
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
size_type m_scratchUnifiedSupported ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id );
void finalize();
void print_configuration( std::ostream & ) const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchUnifiedSupported( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
size_type * scratch_unified( const size_type size );
};
//----------------------------------------------------------------------------
void CudaInternal::print_configuration( std::ostream & s ) const
{
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
#if defined( KOKKOS_HAVE_CUDA )
s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ;
#endif
#if defined( CUDA_VERSION )
s << "macro CUDA_VERSION = " << CUDA_VERSION
<< " = version " << CUDA_VERSION / 1000
<< "." << ( CUDA_VERSION % 1000 ) / 10
<< std::endl ;
#endif
for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) {
s << "Kokkos::Cuda[ " << i << " ] "
<< dev_info.m_cudaProp[i].name
<< " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor
<< ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
<< ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
if ( m_cudaDev == i ) s << " : Selected" ;
s << std::endl ;
}
}
//----------------------------------------------------------------------------
CudaInternal::~CudaInternal()
{
if ( m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
}
m_cudaDev = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
m_scratchUnifiedSupported = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
}
int CudaInternal::verify_is_initialized( const char * const label ) const
{
if ( m_cudaDev < 0 ) {
std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
}
return 0 <= m_cudaDev ;
}
CudaInternal & CudaInternal::singleton()
{
static CudaInternal self ;
return self ;
}
void CudaInternal::initialize( int cuda_device_id )
{
enum { WordSize = sizeof(size_type) };
if ( ! Cuda::host_mirror_device_type::is_initialized() ) {
const std::string msg("Cuda::initialize ERROR : Cuda::host_mirror_device_type is not initialized");
throw_runtime_exception( msg );
}
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
const bool ok_id = 0 <= cuda_device_id &&
cuda_device_id < dev_info.m_cudaDevCount ;
// Need device capability 2.0 or better
const bool ok_dev = ok_id &&
( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
if ( ok_init && ok_dev ) {
const struct cudaDeviceProp & cudaProp =
dev_info.m_cudaProp[ cuda_device_id ];
m_cudaDev = cuda_device_id ;
CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
CUDA_SAFE_CALL( cudaDeviceReset() );
Kokkos::Impl::cuda_device_synchronize();
//----------------------------------
// Maximum number of warps per block:
// at most one warp per thread in a single warp, since the inter-warp
// reduction is carried out by one warp (one thread per warp's partial result).
// HCE 2012-February :
// Found bug in CUDA 4.1 that sometimes a kernel launch would fail
// if the thread count == 1024 and a functor is passed to the kernel.
// Copying the kernel to constant memory and then launching with
// thread count == 1024 would work fine.
//
// HCE 2012-October :
// All compute capabilities support at least 16 warps (512 threads).
// However, we have found that using 8 warps typically gives better performance.
m_maxWarpCount = 8 ;
// m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
m_maxWarpCount = Impl::CudaTraits::WarpSize ;
}
m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
//----------------------------------
m_maxBlock = cudaProp.maxGridSize[0] ;
//----------------------------------
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
<< " does not support unified virtual address space"
<< std::endl ;
}
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
(void) scratch_unified( 16 * sizeof(size_type) );
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
}
else {
std::ostringstream msg ;
msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
if ( ! ok_init ) {
msg << " : Already initialized" ;
}
if ( ! ok_id ) {
msg << " : Device identifier out of range "
<< "[0.." << dev_info.m_cudaDevCount << "]" ;
}
else if ( ! ok_dev ) {
msg << " : Device " ;
msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
msg << "." ;
msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
msg << " has insufficient capability, required 2.0 or better" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
//----------------------------------------------------------------------------
typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
enum { sizeScratchGrain = sizeof(ScratchGrain) };
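// Worked example of the scratch granularity: allocations below are made in
// whole "grains" of one warp's worth of size_type words.  Assuming
// WarpSize == 32 and a 4-byte size_type, sizeScratchGrain == 128 bytes and
// a request of 'size' bytes is rounded up as
//
//   count = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain
//
// e.g. a 200-byte request yields count == 2 grains == 256 bytes.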
Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
Cuda::memory_space::decrement( m_scratchFlags );
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchFlags = (size_type *)
Cuda::memory_space::allocate(
std::string("InternalScratchFlags") ,
typeid( ScratchGrain ),
sizeof( ScratchGrain ),
m_scratchFlagsCount );
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
return m_scratchFlags ;
}
Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
Cuda::memory_space::decrement( m_scratchSpace );
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchSpace = (size_type *)
Cuda::memory_space::allocate(
std::string("InternalScratchSpace") ,
typeid( ScratchGrain ),
sizeof( ScratchGrain ),
m_scratchSpaceCount );
}
return m_scratchSpace ;
}
Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_unified") && m_scratchUnifiedSupported ) {
const bool allocate = m_scratchUnifiedCount * sizeScratchGrain < size ;
const bool deallocate = m_scratchUnified && ( 0 == size || allocate );
if ( allocate || deallocate ) {
Kokkos::Impl::cuda_device_synchronize();
}
if ( deallocate ) {
CUDA_SAFE_CALL( cudaFreeHost( m_scratchUnified ) );
m_scratchUnified = 0 ;
m_scratchUnifiedCount = 0 ;
}
if ( allocate ) {
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
CUDA_SAFE_CALL( cudaHostAlloc( (void **)( & m_scratchUnified ) ,
m_scratchUnifiedCount * sizeScratchGrain ,
cudaHostAllocDefault ) );
}
}
return m_scratchUnified ;
}
//----------------------------------------------------------------------------
void CudaInternal::finalize()
{
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
Cuda::memory_space::decrement( m_scratchSpace );
Cuda::memory_space::decrement( m_scratchFlags );
(void) scratch_unified( 0 );
m_cudaDev = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
}
}
//----------------------------------------------------------------------------
Cuda::size_type cuda_internal_maximum_warp_count()
{ return CudaInternal::singleton().m_maxWarpCount ; }
Cuda::size_type cuda_internal_maximum_grid_count()
{ return CudaInternal::singleton().m_maxBlock ; }
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }
Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config )
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id ); }
std::vector<unsigned>
Cuda::detect_device_arch()
{
const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton();
std::vector<unsigned> output( s.m_cudaDevCount );
for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) {
output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ;
}
return output ;
}
Cuda::size_type Cuda::device_arch()
{
const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ;
int dev_arch = 0 ;
if ( 0 <= dev_id ) {
const struct cudaDeviceProp & cudaProp =
Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ;
dev_arch = cudaProp.major * 100 + cudaProp.minor ;
}
return dev_arch ;
}
void Cuda::finalize()
{ Impl::CudaInternal::singleton().finalize(); }
void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
void Cuda::fence()
{
Kokkos::Impl::cuda_device_synchronize();
}
unsigned Cuda::team_max()
{
return Impl::CudaInternal::singleton().m_maxWarpCount << Impl::CudaTraits::WarpIndexShift ;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
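// A minimal usage sketch of the interface implemented above.  It assumes
// that the host mirror device exposes an initialize()/finalize() pair and
// that SelectDevice is constructible from a device id; neither is shown in
// this file.
#if 0
#include <Kokkos_Cuda.hpp>
#include <iostream>
int main()
{
  Kokkos::Cuda::host_mirror_device_type::initialize(); // assumed: must precede Cuda::initialize
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
  std::cout << "detected " << Kokkos::Cuda::detect_device_count()
            << " device(s)" << std::endl ;
  Kokkos::Cuda::print_configuration( std::cout );
  Kokkos::Cuda::finalize();
  Kokkos::Cuda::host_mirror_device_type::finalize(); // assumed counterpart
  return 0 ;
}
#endif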

View File

@ -0,0 +1,69 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_INTERNAL_HPP
#define KOKKOS_CUDA_INTERNAL_HPP
namespace Kokkos {
namespace Impl {
void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
void cuda_device_synchronize();
inline
void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
{
if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
}
}
}
#define CUDA_SAFE_CALL( call ) \
Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
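// Example expansion of the macro above: wrapping a runtime call records the
// call text, file, and line in any thrown error message.
//
//   CUDA_SAFE_CALL( cudaSetDevice(0) );
//
// expands to
//
//   Kokkos::Impl::cuda_internal_safe_call( cudaSetDevice(0) ,
//     "cudaSetDevice(0)" , __FILE__ , __LINE__ );
//
// which throws via cuda_internal_error_throw() whenever the call does not
// return cudaSuccess.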
#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */

View File

@ -0,0 +1,829 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_PARALLEL_HPP
#define KOKKOS_CUDA_PARALLEL_HPP
#include <iostream>
#include <stdio.h>
#if defined( __CUDACC__ )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelFor< FunctorType , WorkSpec /* size_t */ , Cuda > {
private:
const FunctorType m_functor ;
const Cuda::size_type m_work ;
ParallelFor();
ParallelFor & operator = ( const ParallelFor & );
public:
inline
__device__
void operator()(void) const
{
const Cuda::size_type work_stride = blockDim.x * gridDim.x ;
for ( Cuda::size_type
iwork = threadIdx.x + blockDim.x * blockIdx.x ;
iwork < m_work ;
iwork += work_stride ) {
m_functor( iwork );
}
}
ParallelFor( const FunctorType & functor ,
const size_t work )
: m_functor( functor )
, m_work( work )
{
const dim3 block( CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1, 1);
const dim3 grid( std::min( ( m_work + block.x - 1 ) / block.x , cuda_internal_maximum_grid_count() ) , 1 , 1 );
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
}
};
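// Sketch of a functor this specialization executes: the grid-stride loop in
// operator() above invokes functor(iwork) once per work index.  (It is
// assumed here that the Kokkos::parallel_for(work_count, functor) front end
// dispatches to this class when the functor's device_type is Cuda.)
#if 0
struct ScaleFunctor {
  typedef Kokkos::Cuda device_type ;   // assumed dispatch tag
  double * m_data ;
  double   m_alpha ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const Kokkos::Cuda::size_type i ) const
    { m_data[i] *= m_alpha ; }
};
// ScaleFunctor f = { device_ptr , 2.0 };
// Kokkos::parallel_for( n , f );   // assumed front-end call
#endif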
template< class FunctorType >
class ParallelFor< FunctorType , ParallelWorkRequest , Cuda > {
private:
const FunctorType m_functor ;
const ParallelWorkRequest m_work ;
const int m_shmem ;
ParallelFor();
ParallelFor & operator = ( const ParallelFor & );
public:
inline
__device__
void operator()(void) const
{
CudaExec exec( 0 , m_shmem );
m_functor( Cuda( exec ) );
}
ParallelFor( const FunctorType & functor ,
const ParallelWorkRequest & work )
: m_functor( functor )
, m_work( std::min( work.league_size , size_t(cuda_internal_maximum_grid_count()) ) ,
std::min( work.team_size , size_t(CudaTraits::WarpSize * cuda_internal_maximum_warp_count()) ) )
, m_shmem( FunctorShmemSize< FunctorType >::value( functor ) )
{
const dim3 grid( m_work.league_size , 1 , 1 );
const dim3 block( m_work.team_size , 1, 1 );
CudaParallelLaunch< ParallelFor >( *this , grid , block , m_shmem );
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType >
class ParallelFor< FunctorType , CudaWorkConfig , Cuda > {
public:
const FunctorType m_work_functor ;
inline
__device__
void operator()(void) const
{
Cuda::size_type iwork = threadIdx.x + blockDim.x * (
threadIdx.y + blockDim.y * (
threadIdx.z + blockDim.z * (
blockIdx.x + gridDim.x * (
blockIdx.y + gridDim.y * (
blockIdx.z )))));
m_work_functor( iwork );
}
ParallelFor( const FunctorType & functor ,
const CudaWorkConfig & work_config )
: m_work_functor( functor )
{
const dim3 grid( work_config.grid[0] ,
work_config.grid[1] ,
work_config.grid[2] );
const dim3 block( work_config.block[0] ,
work_config.block[1] ,
work_config.block[2] );
CudaParallelLaunch< ParallelFor >( *this , grid , block , work_config.shared );
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelReduce< FunctorType , WorkSpec , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
enum { WarpCount = 8 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = BlockSize };
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
pointer_type m_host_pointer ;
size_type m_work ;
size_type m_work_per_block ;
size_type m_local_block_count ;
size_type m_global_block_begin ;
size_type m_global_block_count ;
__device__ inline
void operator()(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
{
reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value );
m_functor.init( value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const size_type iwork_beg = blockIdx.x * m_work_per_block ;
const size_type iwork_end = iwork_beg + m_work_per_block < m_work
? iwork_beg + m_work_per_block : m_work ;
for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) {
m_functor( iwork , value );
}
}
// Reduce with final value at BlockSize - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,BlockSize>(
m_functor , m_global_block_begin + blockIdx.x , m_global_block_count ,
shared_data , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final thread's location
size_type * const shared = shared_data + BlockSizeMask * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; }
}
}
ParallelReduce( const FunctorType & functor ,
const size_t nwork ,
const pointer_type result = 0 ,
const bool execute_immediately = true )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_host_pointer( result )
, m_work( nwork )
, m_work_per_block( 0 )
, m_local_block_count( 0 )
, m_global_block_begin( 0 )
, m_global_block_count( 0 )
{
// At most 'max_grid' blocks:
const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize ));
// How much work per block:
m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are really needed for this much work:
m_local_block_count = ( nwork + m_work_per_block - 1 ) / m_work_per_block ;
m_global_block_count = m_local_block_count ;
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
if ( execute_immediately ) { execute(); }
}
inline
void execute() const
{
const dim3 grid( m_local_block_count , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( m_functor );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
}
void wait() const
{
Cuda::fence();
if ( m_host_pointer ) {
if ( m_unified_space ) {
const int count = Reduce::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = Reduce::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size );
}
}
}
};
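// Sketch of a reduction functor consumed by this class: operator() above
// calls functor.init(), functor(iwork,value), and (via the inter-block
// reduce-scan) functor.join(); Reduce::final() is optional.  The
// member-function init/join signatures are assumed from the ReduceAdapter
// usage here.
#if 0
struct SumFunctor {
  typedef Kokkos::Cuda device_type ;  // assumed dispatch tag
  typedef double       value_type ;
  const double * m_data ;
  KOKKOS_INLINE_FUNCTION
  void init( value_type & update ) const { update = 0 ; }
  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & update ,
             const volatile value_type & input ) const { update += input ; }
  KOKKOS_INLINE_FUNCTION
  void operator()( const Kokkos::Cuda::size_type i , value_type & update ) const
    { update += m_data[i] ; }
};
// double result = 0 ;
// Kokkos::parallel_reduce( n , sum_functor_instance , result );  // assumed front-end call
#endif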
template< class FunctorType >
class ParallelReduce< FunctorType , ParallelWorkRequest , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
enum { WarpCount = 8 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = BlockSize };
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
pointer_type m_host_pointer ;
size_type m_shmem_begin ;
size_type m_shmem_end ;
size_type m_local_block_count ;
size_type m_global_block_begin ;
size_type m_global_block_count ;
__device__ inline
void operator()(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
{
reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value );
m_functor.init( value );
CudaExec exec( m_shmem_begin , m_shmem_end );
m_functor( Cuda( exec ) , value );
}
// Reduce with final value at BlockSize - 1 location.
if ( cuda_single_inter_block_reduce_scan<false,BlockSize>(
m_functor , m_global_block_begin + blockIdx.x , m_global_block_count ,
shared_data , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final thread's location
size_type * const shared = shared_data + BlockSizeMask * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); }
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; }
}
}
ParallelReduce( const FunctorType & functor ,
const ParallelWorkRequest & work ,
const pointer_type result = 0 ,
const bool execute_immediately = true )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
, m_host_pointer( result )
, m_shmem_begin( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor ) )
, m_shmem_end( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor )
+ FunctorShmemSize< FunctorType >::value( functor ) )
, m_local_block_count( 0 )
, m_global_block_begin( 0 )
, m_global_block_count( 0 )
{
m_local_block_count = std::min( int(GridMax) , int(work.league_size) );
m_global_block_count = std::min( int(GridMax) , int(work.league_size) );
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) );
if ( execute_immediately ) { execute(); }
}
inline
void execute() const
{
const dim3 grid( m_local_block_count , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, m_shmem_end ); // copy to device and execute
}
void wait() const
{
Cuda::fence();
if ( m_host_pointer ) {
if ( m_unified_space ) {
const int count = Reduce::value_count( m_functor );
for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = Reduce::value_size( m_functor );
DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size );
}
}
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Functor >
class MultiFunctorParallelReduceMember ;
template<>
class MultiFunctorParallelReduceMember<void>
{
private:
MultiFunctorParallelReduceMember( const MultiFunctorParallelReduceMember & );
MultiFunctorParallelReduceMember & operator = ( const MultiFunctorParallelReduceMember & );
protected:
MultiFunctorParallelReduceMember() {}
public:
virtual unsigned block_count() const = 0 ;
virtual ~MultiFunctorParallelReduceMember() {}
virtual void execute( void * const host_pointer ,
const unsigned global_block_begin ,
const unsigned global_block_count ) = 0 ;
virtual void wait() const = 0 ;
};
template< class Functor >
class MultiFunctorParallelReduceMember : public MultiFunctorParallelReduceMember<void> {
public:
ParallelReduce< Functor , size_t , Cuda > m_functor ;
MultiFunctorParallelReduceMember( const Functor & f , size_t nwork )
: MultiFunctorParallelReduceMember<void>()
, m_functor( f , nwork , 0 , false )
{}
virtual unsigned block_count() const { return m_functor.m_local_block_count ; }
virtual void execute( void * const host_pointer ,
const unsigned global_block_begin ,
const unsigned global_block_count )
{
m_functor.m_host_pointer = typename ReduceAdapter< Functor >::pointer_type(host_pointer);
m_functor.m_global_block_begin = global_block_begin ;
m_functor.m_global_block_count = global_block_count ;
m_functor.execute();
}
virtual void wait() const { m_functor.wait(); }
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
template<>
class MultiFunctorParallelReduce< Cuda >
{
private:
typedef std::vector< Impl::MultiFunctorParallelReduceMember<void> * > MemberVector ;
MemberVector m_functors ;
public:
MultiFunctorParallelReduce()
: m_functors()
{}
~MultiFunctorParallelReduce()
{
while ( ! m_functors.empty() ) {
delete m_functors.back();
m_functors.pop_back();
}
}
template< class FunctorType >
void push_back( const size_t work_count , const FunctorType & f )
{
m_functors.push_back( new Impl::MultiFunctorParallelReduceMember<FunctorType>( f , work_count ) );
}
void execute( void * host_pointer )
{
typename MemberVector::iterator m ;
Cuda::size_type block_count = 0 ;
for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) {
block_count += (*m)->block_count();
}
Cuda::size_type block_offset = 0 ;
for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) {
(*m)->execute( host_pointer , block_offset , block_count );
block_offset += (*m)->block_count();
}
}
void wait() const
{
if ( ! m_functors.empty() ) { (m_functors.back())->wait(); }
}
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class WorkSpec >
class ParallelScan< FunctorType , WorkSpec , Cuda >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints:
// (a) blockSize is a power of two
// (b) blockDim.x == BlockSize == 1 << BlockSizeShift
// (c) blockDim.y == blockDim.z == 1
// (d) gridDim.x <= blockDim.x * blockDim.x
// (e) gridDim.y == gridDim.z == 1
// blockDim.x must be a power of two: 128 (4 warps), 256 (8 warps), or 512 (16 warps)
// gridDim.x <= blockDim.x * blockDim.x
//
// Using 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
enum { WarpCount = 4 };
enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
enum { BlockSizeMask = BlockSize - 1 };
enum { GridMaxComputeCapability_2x = 0x0ffff };
enum { GridMax = ( BlockSize * BlockSize ) < GridMaxComputeCapability_2x
? ( BlockSize * BlockSize ) : GridMaxComputeCapability_2x };
const FunctorType m_functor ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
const size_type m_work ;
size_type m_work_per_block ;
size_type m_final ;
//----------------------------------------
__device__ inline
void initial(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
size_type * const shared_value = shared_data + word_count.value * threadIdx.x ;
m_functor.init( Reduce::reference( shared_value ) );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const size_type iwork_beg = blockIdx.x * m_work_per_block ;
const size_type iwork_end = iwork_beg + m_work_per_block < m_work
? iwork_beg + m_work_per_block : m_work ;
for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) {
m_functor( iwork , Reduce::reference( shared_value ) , false );
}
// Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
// Blocks' scan values are written to 'blockIdx.x' location.
// Block-groups' scan values are at: i = ( j * BlockSize - 1 ) for i < gridDim.x
cuda_single_inter_block_reduce_scan<true,BlockSize>( m_functor , blockIdx.x , gridDim.x , shared_data , m_scratch_space , m_scratch_flags );
}
//----------------------------------------
__device__ inline
void final(void) const
{
extern __shared__ size_type shared_data[];
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( m_functor ) / sizeof(size_type) );
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
size_type * const shared_prefix = shared_data + word_count.value * threadIdx.x ;
size_type * const shared_accum = shared_data + word_count.value * ( BlockSize + 1 );
// Starting value for this thread block is the previous block's total.
if ( blockIdx.x ) {
size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
}
else if ( 0 == threadIdx.x ) {
m_functor.init( Reduce::reference( shared_accum ) );
}
unsigned iwork_beg = blockIdx.x * m_work_per_block ;
const unsigned iwork_end = iwork_beg + m_work_per_block ;
for ( ; iwork_beg < iwork_end ; iwork_beg += BlockSize ) {
const unsigned iwork = threadIdx.x + iwork_beg ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
m_functor.init( Reduce::reference( shared_prefix + word_count.value ) );
// Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix + word_count.value ) , false ); }
// Scan block values into locations shared_data[1..BlockSize]
cuda_intra_block_reduce_scan<true>( m_functor , Reduce::pointer_type(shared_data+word_count.value) );
{
size_type * const block_total = shared_data + word_count.value * blockDim.x ;
for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
}
// Call functor with exclusive scan value
if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix ) , true ); }
}
}
//----------------------------------------
__device__ inline
void operator()(void) const
{
if ( ! m_final ) {
initial();
}
else {
final();
}
}
ParallelScan( const FunctorType & functor ,
const size_t nwork )
: m_functor( functor )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_work( nwork )
, m_work_per_block( 0 )
, m_final( false )
{
// At most 'max_grid' blocks:
const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize ));
// How much work per block:
m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
// How many blocks are really needed for this much work:
const dim3 grid( ( nwork + m_work_per_block - 1 ) / m_work_per_block , 1 , 1 );
const dim3 block( BlockSize , 1 , 1 );
const int shmem = Reduce::value_size( functor ) * ( BlockSize + 2 );
m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * grid.x );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
}
void wait() const { Cuda::fence(); }
};
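// Sketch of a scan functor consumed by this class: operator() above is
// invoked twice per work item, first with final == false to accumulate the
// running total and then with final == true carrying the exclusive prefix.
#if 0
struct PrefixSumFunctor {
  typedef Kokkos::Cuda device_type ;  // assumed dispatch tag
  typedef long         value_type ;
  const int * m_in ;
  long      * m_out ;
  KOKKOS_INLINE_FUNCTION
  void init( value_type & update ) const { update = 0 ; }
  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & update ,
             const volatile value_type & input ) const { update += input ; }
  KOKKOS_INLINE_FUNCTION
  void operator()( const Kokkos::Cuda::size_type i ,
                   value_type & update , const bool final ) const
  {
    if ( final ) { m_out[i] = update ; }  // exclusive prefix sum of m_in[0..i-1]
    update += m_in[i] ;
  }
};
#endif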
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDA_ARCH__ )
namespace Kokkos {
namespace Impl {
template< typename Type >
struct CudaJoinFunctor {
typedef Type value_type ;
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
volatile const value_type & input )
{ update += input ; }
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
template< typename TypeLocal , typename TypeGlobal >
__device__ inline TypeGlobal Cuda::team_scan( const TypeLocal & value , TypeGlobal * const global_accum )
{
enum { BlockSizeMax = 512 };
__shared__ TypeGlobal base_data[ BlockSizeMax + 1 ];
__syncthreads(); // Don't write in to shared data until all threads have entered this function
if ( 0 == threadIdx.x ) { base_data[0] = 0 ; }
base_data[ threadIdx.x + 1 ] = value ;
Impl::cuda_intra_block_reduce_scan<true>( Impl::CudaJoinFunctor<TypeGlobal>() , base_data + 1 );
if ( global_accum ) {
if ( blockDim.x == threadIdx.x + 1 ) {
base_data[ blockDim.x ] = atomic_fetch_add( global_accum , base_data[ blockDim.x ] );
}
__syncthreads(); // Wait for atomic
base_data[ threadIdx.x ] += base_data[ blockDim.x ] ;
}
return base_data[ threadIdx.x ];
}
template< typename Type >
__device__ inline Type Cuda::team_scan( const Type & value )
{ return team_scan( value , (Type*) 0 ); }
} // namespace Kokkos
#else /* ! defined( __CUDA_ARCH__ ) */
namespace Kokkos {
template< typename Type > inline Type Cuda::team_scan( const Type & ) { return 0 ; }
template< typename TypeLocal , typename TypeGlobal >
inline TypeGlobal Cuda::team_scan( const TypeLocal & , TypeGlobal * const ) { return 0 ; }
} // namespace Kokkos
#endif /* ! defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */

View File

@ -0,0 +1,267 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
#define KOKKOS_CUDA_REDUCESCAN_HPP
#if defined( __CUDACC__ )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
// Must have a consistent '__shared__' declaration across all device kernels.
// Since there may be more than one kernel in a file, the shared memory is
// declared as a simple array of words.
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.x is a power of two
* (b) blockDim.x <= 512
* (c) blockDim.y == blockDim.z == 1
*/
template< bool DoScan , class FunctorType >
__device__
void cuda_intra_block_reduce_scan( const FunctorType & functor ,
const typename ReduceAdapter< FunctorType >::pointer_type base_data )
{
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
const unsigned value_count = Reduce::value_count( functor );
const unsigned BlockSizeMask = blockDim.x - 1 ;
// Must have power of two thread count
if ( BlockSizeMask & blockDim.x ) { cuda_abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
#define BLOCK_REDUCE_STEP( R , TD , S ) \
if ( ! ( R & ((1<<(S+1))-1) ) ) \
{ functor.join( Reduce::reference(TD) , Reduce::reference(TD - (value_count<<S))); }
#define BLOCK_SCAN_STEP( TD , N , S ) \
if ( N == (1<<S) ) \
{ functor.join( Reduce::reference(TD) , Reduce::reference(TD - (value_count<<S))); }
const unsigned rtid_intra = threadIdx.x ^ BlockSizeMask ;
const pointer_type tdata_intra = base_data + value_count * threadIdx.x ;
{ // Intra-warp reduction:
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
}
__syncthreads(); // Wait for all warps to reduce
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
const unsigned rtid_inter = ( threadIdx.x ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
if ( rtid_inter < blockDim.x ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( DoScan ) {
int n = ( rtid_inter & 32 ) ? 32 : (
( rtid_inter & 64 ) ? 64 : (
( rtid_inter & 128 ) ? 128 : (
( rtid_inter & 256 ) ? 256 : 0 )));
if ( ! ( rtid_inter + n < blockDim.x ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_inter,n,8)
BLOCK_SCAN_STEP(tdata_inter,n,7)
BLOCK_SCAN_STEP(tdata_inter,n,6)
BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
__syncthreads(); // Wait for inter-warp reduce-scan to complete
if ( DoScan ) {
int n = ( rtid_intra & 1 ) ? 1 : (
( rtid_intra & 2 ) ? 2 : (
( rtid_intra & 4 ) ? 4 : (
( rtid_intra & 8 ) ? 8 : (
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < blockDim.x ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0)
}
#undef BLOCK_SCAN_STEP
#undef BLOCK_REDUCE_STEP
}
//----------------------------------------------------------------------------
/**\brief Input value-per-thread starting at 'shared_data'.
* Reduction value at last thread's location.
*
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
*
 * Global reduce result is in the last thread's 'shared_data' location.
*/
template< bool DoScan , unsigned ArgBlockSize , class FunctorType >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef Cuda::size_type size_type ;
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
typedef typename Reduce::reference_type reference_type ;
enum { BlockSize = ArgBlockSize };
enum { BlockSizeMask = BlockSize - 1 };
enum { BlockSizeShift = power_of_two< BlockSize >::value };
const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) >
word_count( Reduce::value_size( functor ) / sizeof(size_type) );
// Must have power of two thread count
if ( BlockSize != blockDim.x ) { cuda_abort("Cuda::cuda_inter_block_scan wrong blockDim.x"); }
// Reduce the accumulation for the entire block.
cuda_intra_block_reduce_scan<false>( functor , pointer_type(shared_data) );
{
// Write accumulation total to global scratch space.
// Accumulation total is the last thread's data.
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
for ( size_type i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i] ; }
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
// If this block is not the last block to contribute to this group then the block is done.
const bool is_last_block =
! __syncthreads_or( threadIdx.x ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
if ( is_last_block ) {
const size_type b = ( long(block_count) * long(threadIdx.x) ) >> BlockSizeShift ;
const size_type e = ( long(block_count) * long( threadIdx.x + 1 ) ) >> BlockSizeShift ;
{
reference_type shared_value = Reduce::reference( shared_data + word_count.value * threadIdx.x );
functor.init( shared_value );
for ( size_type i = b ; i < e ; ++i ) {
functor.join( shared_value , Reduce::reference( global_data + word_count.value * i ) );
}
}
cuda_intra_block_reduce_scan<DoScan>( functor , pointer_type(shared_data) );
if ( DoScan ) {
size_type * const shared_value = shared_data + word_count.value * ( threadIdx.x ? threadIdx.x - 1 : BlockSize );
if ( ! threadIdx.x ) { functor.init( Reduce::reference( shared_value ) ); }
// Join previous inclusive scan value to each member
for ( size_type i = b ; i < e ; ++i ) {
size_type * const global_value = global_data + word_count.value * i ;
functor.join( Reduce::reference( shared_value ) , Reduce::reference( global_value ) );
Reduce::copy( functor , global_value , shared_value );
}
}
}
return is_last_block ;
}
template< bool DoScan , unsigned ArgBlockSize , class FunctorType >
inline
unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor )
{
return ( ArgBlockSize + 2 ) * ReduceAdapter< FunctorType >::value_size( functor );
}
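// Worked example of the shared memory sizing above: the reduce/scan keeps
// one value per thread plus two extra slots (the exclusive-scan shift and
// the block accumulation total).  Assuming BlockSize == 256 and an 8-byte
// value_type (a single double), that is ( 256 + 2 ) * 8 = 2064 bytes per block.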
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( __CUDACC__ ) */
#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */

View File

@ -0,0 +1,323 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Cuda.hpp>
namespace Kokkos {
// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the Device and the Scalar type,
// and it controls whether shfl_down() gets called.)
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const int& width){
return val;
}
template<>
KOKKOS_INLINE_FUNCTION
unsigned int shfl_down<unsigned int>(const unsigned int &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
unsigned int tmp1 = val;
int tmp = *reinterpret_cast<int*>(&tmp1);
tmp = __shfl_down(tmp,delta,width);
return *reinterpret_cast<unsigned int*>(&tmp);
#else
return val;
#endif
#else
return val;
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
int shfl_down<int>(const int &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
return __shfl_down(val,delta,width);
#else
return val;
#endif
#else
return val;
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
float shfl_down<float>(const float &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
return __shfl_down(val,delta,width);
#else
return val;
#endif
#else
return val;
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
double shfl_down<double>(const double &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2double(hi,lo);
#else
return val;
#endif
#else
return val;
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
long int shfl_down<long int>(const long int &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const long int*>(&tmp));
#else
return val;
#endif
#else
return val;
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
unsigned long shfl_down<unsigned long>(const unsigned long &val, const int& delta, const int& width){
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const unsigned long*>(&tmp));
#else
return val;
#endif
#else
return val;
#endif
}
template<int N>
struct Vectorization<Cuda,N> {
enum {increment = N};
#ifdef __CUDA_ARCH__
KOKKOS_FORCEINLINE_FUNCTION
static int begin() { return threadIdx.x%N;}
#else
KOKKOS_FORCEINLINE_FUNCTION
static int begin() { return 0;}
#endif
KOKKOS_FORCEINLINE_FUNCTION
static int thread_rank(const Cuda &dev) {
return dev.team_rank()/increment;
}
KOKKOS_FORCEINLINE_FUNCTION
static int global_thread_rank(const Cuda &dev) {
return (dev.league_rank()*dev.team_size()+dev.team_rank())/increment;
}
KOKKOS_FORCEINLINE_FUNCTION
static bool is_lane_0(const Cuda &dev) {
return (dev.team_rank()%increment)==0;
}
template<class Scalar>
KOKKOS_INLINE_FUNCTION
static Scalar reduce(const Scalar& val) {
#ifdef __CUDA_ARCH__
__shared__ Scalar result[256];
Scalar myresult;
for(int k=0;k<blockDim.x;k+=256) {
const int tid = threadIdx.x - k;
if(tid >= 0 && tid<256) {  // include lane 0 of each 256-thread chunk so result[0] and myresult are set
result[tid] = val;
if ( (N > 1) && (tid%2==0) )
result[tid] += result[tid+1];
if ( (N > 2) && (tid%4==0) )
result[tid] += result[tid+2];
if ( (N > 4) && (tid%8==0) )
result[tid] += result[tid+4];
if ( (N > 8) && (tid%16==0) )
result[tid] += result[tid+8];
if ( (N > 16) && (tid%32==0) )
result[tid] += result[tid+16];
myresult = result[tid];
}
if(blockDim.x>256)
__syncthreads();
}
return myresult;
#else
return val;
#endif
}
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
KOKKOS_INLINE_FUNCTION
static int reduce(const int& val) {
int result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
KOKKOS_INLINE_FUNCTION
static unsigned int reduce(const unsigned int& val) {
unsigned int result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
KOKKOS_INLINE_FUNCTION
static long int reduce(const long int& val) {
long int result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
KOKKOS_INLINE_FUNCTION
static unsigned long int reduce(const unsigned long int& val) {
unsigned long int result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
KOKKOS_INLINE_FUNCTION
static float reduce(const float& val) {
float result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
KOKKOS_INLINE_FUNCTION
static double reduce(const double& val) {
double result = val;
if (N > 1)
result += shfl_down(result, 1,N);
if (N > 2)
result += shfl_down(result, 2,N);
if (N > 4)
result += shfl_down(result, 4,N);
if (N > 8)
result += shfl_down(result, 8,N);
if (N > 16)
result += shfl_down(result, 16,N);
return result;
}
#endif
#endif
};
}
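The double and 64-bit integer specializations above split the value into two 32-bit halves, shuffle each half, and reassemble the result. A minimal standalone CUDA sketch of the same trick, independent of the Kokkos wrappers and assuming a full warp of 32 active lanes on a device of compute capability 3.0 or newer:
#include <cuda_runtime.h>
// Warp-level sum of doubles via the hi/lo split used by shfl_down<double> above.
__device__ double warp_sum_double( double val )
{
  for ( int delta = 16 ; delta > 0 ; delta >>= 1 ) {
    int lo = __double2loint( val );
    int hi = __double2hiint( val );
    lo = __shfl_down( lo , delta , 32 );
    hi = __shfl_down( hi , delta , 32 );
    val += __hiloint2double( hi , lo );
  }
  return val ;  // lane 0 of the warp holds the total
}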

View File

@@ -0,0 +1,594 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_VIEW_HPP
#define KOKKOS_CUDA_VIEW_HPP
#include <cstring>
#if defined( __CUDACC__ )
#include <cuda_runtime.h>
#endif
#include <Kokkos_View.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_CudaTypes.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
struct AssertShapeBoundsAbort< CudaSpace >
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t /* rank */ ,
const size_t /* n0 */ , const size_t /* n1 */ ,
const size_t /* n2 */ , const size_t /* n3 */ ,
const size_t /* n4 */ , const size_t /* n5 */ ,
const size_t /* n6 */ , const size_t /* n7 */ ,
const size_t /* arg_rank */ ,
const size_t /* i0 */ , const size_t /* i1 */ ,
const size_t /* i2 */ , const size_t /* i3 */ ,
const size_t /* i4 */ , const size_t /* i5 */ ,
const size_t /* i6 */ , const size_t /* i7 */ )
{
Kokkos::cuda_abort("Kokkos::View array bounds violation");
}
};
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
cuda_texture_object_type
cuda_texture_object_attach(
const cudaChannelFormatDesc & ,
const void * const );
template< typename TextureType >
inline
cuda_texture_object_type
cuda_texture_object_attach( const void * const base_view_ptr )
{
return cuda_texture_object_attach( cudaCreateChannelDesc<TextureType>() , base_view_ptr );
}
#else
typedef const void * cuda_texture_object_type ;
template< typename TextureType >
inline
cuda_texture_object_type
cuda_texture_object_attach( const void * const )
{ return 0 ; }
#endif
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
template< typename T, size_t size = sizeof(T) >
struct alias_type {
typedef void type;
};
template< typename T >
struct alias_type<T,4> {
typedef int type;
};
template< typename T >
struct alias_type<T,8> {
typedef int2 type;
};
template< typename T >
struct alias_type<T,16> {
typedef int4 type;
};
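For example, an 8-byte scalar such as double aliases to int2 under the mapping above; a hedged sketch of the fetch path that CudaTextureFetch::operator[] implements below (the helper name is illustrative only):
__device__ double fetch_double( cudaTextureObject_t obj , int i )
{
  int2 v = tex1Dfetch<int2>( obj , i );     // 8-byte read through the texture object
  return *reinterpret_cast<double*>( &v );  // reinterpret the raw bits as double
}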
template< typename ValueType, typename AliasType = typename alias_type<ValueType>::type >
struct CudaTextureFetch {
private:
cuda_texture_object_type obj ;
public:
const ValueType * ptr ;
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: obj( rhs.obj ) , ptr( rhs.ptr ) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{ obj = rhs.obj ; ptr = rhs.ptr ; return *this ; }
explicit
CudaTextureFetch( const ValueType * const base_view_ptr )
: obj( cuda_texture_object_attach<AliasType>( base_view_ptr ) )
, ptr( base_view_ptr ) {}
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
// Enable the usage of the __ldg intrinsic even in cases where texture fetches work
// Currently texture fetches are faster, but that might change in the future
#ifdef KOKKOS_USE_LDG_INTRINSIC
return __ldg(&ptr[i]);
#else
AliasType v = tex1Dfetch<AliasType>( obj , i );
return *(reinterpret_cast<ValueType*> (&v));
#endif
#else
return ptr[ i ];
#endif
}
};
template< typename ValueType >
struct CudaTextureFetch< const ValueType, void > {
private:
cuda_texture_object_type obj ;
public:
const ValueType * ptr ;
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: obj( rhs.obj ) , ptr( rhs.ptr ) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{ obj = rhs.obj ; ptr = rhs.ptr ; return *this ; }
explicit
CudaTextureFetch( ValueType * const base_view_ptr )
: obj( cuda_texture_object_attach<ValueType>( base_view_ptr ) )
, ptr( base_view_ptr ) {}
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
return __ldg(&ptr[i]);
#else
return ptr[ i ];
#endif
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct ViewCudaTexture {};
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
/** \brief Replace ViewDefault specialization with Cuda texture fetch specialization
* if 'const' value type and random access.
*/
template< class ValueType , class MemoryTraits >
struct ViewSpecialize< const ValueType , void , LayoutLeft , CudaSpace , MemoryTraits >
{
typedef typename if_c< MemoryTraits::RandomAccess , ViewCudaTexture , ViewDefault >::type type ;
};
template< class ValueType , class MemoryTraits >
struct ViewSpecialize< const ValueType , void , LayoutRight , CudaSpace , MemoryTraits >
{
typedef typename if_c< MemoryTraits::RandomAccess , ViewCudaTexture , ViewDefault >::type type ;
};
#endif
//----------------------------------------------------------------------------
template<>
struct ViewAssignment< ViewCudaTexture , ViewCudaTexture , void >
{
/** \brief Assign compatible views */
template< class DT , class DL , class DD , class DM ,
class ST , class SL , class SD , class SM >
KOKKOS_INLINE_FUNCTION
ViewAssignment( View<DT,DL,DD,DM,ViewCudaTexture> & dst ,
const View<ST,SL,SD,SM,ViewCudaTexture> & src ,
const typename enable_if<(
ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value
) >::type * = 0 )
{
dst.m_tracking.decrement( dst.m_texture.ptr );
dst.m_texture = src.m_texture ;
dst.m_offset_map.assign( src.m_offset_map );
dst.m_tracking = src.m_tracking ;
dst.m_tracking.increment( dst.m_texture.ptr );
}
};
template<>
struct ViewAssignment< ViewCudaTexture , ViewDefault , void >
{
/** \brief Assign compatible views */
template< class DT , class DL , class DD , class DM ,
class ST , class SL , class SD , class SM >
inline
ViewAssignment( View<DT,DL,DD,DM,ViewCudaTexture> & dst ,
const View<ST,SL,SD,SM,ViewDefault> & src ,
const typename enable_if<(
ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
ViewTraits<ST,SL,SD,SM> >::value
)>::type * = 0 )
{
dst.m_tracking.decrement( dst.m_texture.ptr );
dst.m_texture = CudaTextureFetch< typename ViewTraits<DT,DL,DD,DM>::value_type >( src.m_ptr_on_device );
dst.m_offset_map.assign( src.m_offset_map );
dst.m_tracking = src.m_tracking ;
dst.m_tracking.increment( dst.m_texture.ptr );
}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class T , class L, class D , class M >
class View< T , L , D , M , Impl::ViewCudaTexture >
: public ViewTraits< T , L , D , M >
{
public:
typedef ViewTraits< T , L , D , M > traits ;
private:
template< class , class , class > friend struct Impl::ViewAssignment ;
typedef Impl::ViewOffset< typename traits::shape_type
, typename traits::array_layout
> offset_map_type ;
Impl::CudaTextureFetch<typename traits::value_type > m_texture ;
offset_map_type m_offset_map ;
Impl::ViewTracking< traits > m_tracking ;
public:
typedef Impl::ViewCudaTexture specialize ;
typedef View< typename traits::const_data_type ,
typename traits::array_layout ,
typename traits::device_type ,
typename traits::memory_traits > const_type ;
typedef View< typename traits::non_const_data_type ,
typename traits::array_layout ,
typename traits::device_type::host_mirror_device_type ,
void > HostMirror ;
enum { Rank = traits::rank };
KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
KOKKOS_INLINE_FUNCTION typename traits::size_type size() const
{
return m_offset_map.N0
* m_offset_map.N1
* m_offset_map.N2
* m_offset_map.N3
* m_offset_map.N4
* m_offset_map.N5
* m_offset_map.N6
* m_offset_map.N7
;
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
typename traits::size_type dimension( const iType & i ) const
{ return Impl::dimension( m_offset_map , i ); }
//------------------------------------
View() : m_texture()
{ m_offset_map.assign(0,0,0,0,0,0,0,0); }
KOKKOS_INLINE_FUNCTION
~View() { m_tracking.decrement( m_texture.ptr ); }
View( const View & rhs )
: m_texture( rhs.m_texture )
{
m_offset_map.assign( rhs.m_offset_map );
m_tracking = rhs.m_tracking ;
m_tracking.increment( m_texture.ptr );
}
View & operator = ( const View & rhs )
{
(void)Impl::ViewAssignment< Impl::ViewCudaTexture , Impl::ViewCudaTexture >( *this , rhs );
return *this ;
}
template< class RT , class RL, class RD , class RM , class RS >
View( const View<RT,RL,RD,RM,RS> & rhs )
: m_texture(0)
{
Impl::ViewAssignment< Impl::ViewCudaTexture , RS >( *this , rhs );
}
template< class RT , class RL, class RD, class RM , class RS >
View & operator = ( const View<RT,RL,RD,RM,RS> & rhs )
{
Impl::ViewAssignment< Impl::ViewCudaTexture , RS >( *this , rhs );
return *this ;
}
template< typename TT >
explicit inline
View( TT * ptr ,
const size_t n0 = 0 ,
const size_t n1 = 0 ,
const size_t n2 = 0 ,
const size_t n3 = 0 ,
const size_t n4 = 0 ,
const size_t n5 = 0 ,
const size_t n6 = 0 ,
typename Impl::enable_if<(
Impl::is_same<TT,typename traits::value_type>::value
), const size_t >::type n7 = 0 )
: m_texture( Impl::CudaTextureFetch< typename traits::value_type >(ptr))
{
m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
m_tracking = false ;
}
//------------------------------------
KOKKOS_FORCEINLINE_FUNCTION
bool is_null() const { return 0 == m_texture.ptr ; }
//------------------------------------
// Rank = 1 access operators:
template < typename iType0 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type , traits, typename traits::array_layout, 1 , iType0 >::type
operator[] ( const iType0 & i0 ) const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
return m_texture[ i0 ];
}
template < typename iType0 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type , traits , typename traits::array_layout, 1 , iType0 >::type
operator() ( const iType0 & i0 ) const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
return m_texture[ i0 ];
}
template< typename iType0 , typename iType1 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type , traits, typename traits::array_layout, 2, iType0, iType1 >::type
operator() ( const iType0 & i0 , const iType1 & i1 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1) ];
}
template< typename iType0 , typename iType1 , typename iType2 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2) ];
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2,i3) ];
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3, iType4 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2,i3,i4) ];
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5) ];
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 , typename iType6 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ];
}
template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
typename iType4 , typename iType5 , typename iType6 , typename iType7 >
KOKKOS_FORCEINLINE_FUNCTION
typename Impl::ViewEnableArrayOper< typename traits::value_type ,
traits, typename traits::array_layout, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type
operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
{
KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 );
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr );
return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ];
}
//------------------------------------
KOKKOS_FORCEINLINE_FUNCTION
typename traits::value_type * ptr_on_device() const { return m_texture.ptr ; }
// Stride of physical storage, dimensioned to at least Rank
template< typename iType >
KOKKOS_INLINE_FUNCTION
void stride( iType * const s ) const { m_offset_map.stride(s); }
};
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
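A hedged usage sketch of the specialization this header provides: with CUDA 5.0 or newer, a const View carrying the RandomAccess memory trait in CudaSpace is routed through ViewCudaTexture, so element reads become texture fetches. The view names and extent below are illustrative only:
// Writable device view; uses the default View specialization.
Kokkos::View< double* , Kokkos::LayoutLeft , Kokkos::Cuda > x( "x" , 1000 );
// Const, random-access alias of the same allocation; ViewSpecialize above
// selects ViewCudaTexture, so x_tex(i) reads through a texture object.
Kokkos::View< const double* , Kokkos::LayoutLeft , Kokkos::Cuda ,
              Kokkos::MemoryTraits< Kokkos::RandomAccess > > x_tex = x ;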

View File

@@ -0,0 +1,101 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ABORT_HPP
#define KOKKOS_CUDA_ABORT_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
#error "Cuda version 4.1 or greater required"
#endif
#if ( __CUDA_ARCH__ < 200 )
#error "Cuda device capability 2.0 or greater required"
#endif
extern "C" {
/* Cuda runtime function, declared in <crt/device_runtime.h>
* Requires capability 2.x or better.
*/
extern __device__ void __assertfail(
const void *message,
const void *file,
unsigned int line,
const void *function,
size_t charsize);
}
namespace Kokkos {
__device__ inline
void cuda_abort( const char * const message )
{
const char empty[] = "" ;
__assertfail( (const void *) message ,
(const void *) empty ,
(unsigned int) 0 ,
(const void *) empty ,
sizeof(char) );
}
} // namespace Kokkos
#else
namespace Kokkos {
KOKKOS_INLINE_FUNCTION
void cuda_abort( const char * const ) {}
}
#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

View File

@@ -0,0 +1,195 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Atomic.hpp
/// \brief Atomic functions
///
/// This header file defines prototypes for the following atomic functions:
/// - exchange
/// - compare and exchange
/// - add
///
/// Supported types include:
/// - signed and unsigned 4 and 8 byte integers
/// - float
/// - double
///
/// They are implemented through GCC compatible intrinsics, OpenMP
/// directives and native CUDA intrinsics.
///
/// Including this header file requires one of the following
/// compilers:
/// - NVCC (for CUDA device code only)
/// - GCC (for host code only)
/// - Intel (for host code only)
/// - A compiler that supports OpenMP 3.1 (for host code only)
#ifndef KOKKOS_ATOMIC_HPP
#define KOKKOS_ATOMIC_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
#if defined( __CUDA_ARCH__ )
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ATOMICS_USE_CUDA
#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
! defined( KOKKOS_ATOMICS_USE_OMP31 )
// Compiling host code and an atomic implementation has not been pre-selected.
// Choose the best implementation for the detected compiler.
// Preference: GCC, INTEL, OMP31
#if defined( __GNUC__ ) || defined( __GNUG__ ) || defined( __clang__ )
#define KOKKOS_ATOMICS_USE_GCC
#elif defined( __INTEL_COMPILER ) || defined( _CRAYC)
#define KOKKOS_ATOMICS_USE_INTEL
#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
#define KOKKOS_ATOMICS_USE_OMP31
#else
#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
#endif
#endif /* Not pre-selected atomic implementation */
//----------------------------------------------------------------------------
namespace Kokkos {
inline
const char * atomic_query_version()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
return "KOKKOS_ATOMICS_USE_CUDA" ;
#elif defined( KOKKOS_ATOMICS_USE_GCC )
return "KOKKOS_ATOMICS_USE_GCC" ;
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
return "KOKKOS_ATOMICS_USE_INTEL" ;
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
return "KOKKOS_ATOMICS_USE_OMP31" ;
#endif
}
} // namespace Kokkos
//----------------------------------------------------------------------------
// Atomic exchange
//
// template< typename T >
// T atomic_exchange( volatile T* const dest , const T val )
// { T tmp = *dest ; *dest = val ; return tmp ; }
#include "impl/Kokkos_Atomic_Exchange.hpp"
//----------------------------------------------------------------------------
// Atomic compare-and-exchange
//
// template<class T>
// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and add
//
// template<class T>
// T atomic_fetch_add(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest += val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and or
//
// template<class T>
// T atomic_fetch_or(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and and
//
// template<class T>
// T atomic_fetch_and(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_And.hpp"
//----------------------------------------------------------------------------
// Memory fence
//
// All loads and stores from this thread will be globally consistent before continuing
//
// void memory_fence() {...};
#include "impl/Kokkos_Memory_Fence.hpp"
//----------------------------------------------------------------------------
// Provide volatile_load and safe_load
//
// T volatile_load(T const volatile * const ptr);
//
// T const& safe_load(T const * const ptr);
// XEON PHI
// T safe_load(T const * const ptr);
#include "impl/Kokkos_Volatile_Load.hpp"
#endif /* KOKKOS_ATOMIC_HPP */
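As a hedged usage sketch of the prototypes documented above, a functor that bins values into a shared histogram must use atomic_fetch_add because many threads may update the same bin concurrently (the struct and member names are illustrative):
#include <Kokkos_Atomic.hpp>
struct HistogramFunctor {
  const int * values ;   // input: bin index per entry
  int       * bins ;     // output: shared counters
  KOKKOS_INLINE_FUNCTION
  void operator()( const int i ) const
  {
    // Returns the old count; only the atomic increment matters here.
    Kokkos::atomic_fetch_add( & bins[ values[i] ] , 1 );
  }
};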

View File

@@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CRSARRAY_HPP
#define KOKKOS_CRSARRAY_HPP
#include <string>
#include <vector>
#include <Kokkos_View.hpp>
namespace Kokkos {
/// \class CrsArray
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries. If a CrsArray is
/// used as the graph of a sparse matrix, then this is usually an
/// integer type, the type of the column indices in the sparse
/// matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
/// either to the Device type (if there are no more template
/// parameters) or to the Layout type (if there is at least one more
/// template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
/// corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets. Usually the default
/// parameter suffices. However, setting a nondefault value is
/// necessary in some cases, for example, if you want to have a
/// sparse matrix with dimensions (and therefore column indices)
/// that fit in \c int, but want to store more than <tt>INT_MAX</tt>
/// entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template< class DataType,
class Arg1Type,
class Arg2Type = void,
typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
class CrsArray {
private:
typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
public:
typedef DataType data_type;
typedef typename traits::array_layout array_layout;
typedef typename traits::device_type device_type;
typedef SizeType size_type;
typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type;
typedef CrsArray< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror;
typedef View< const size_type* , array_layout, device_type > row_map_type;
typedef View< DataType* , array_layout, device_type > entries_type;
entries_type entries;
row_map_type row_map;
//! Construct an empty view.
CrsArray () : entries(), row_map() {}
//! Copy constructor (shallow copy).
CrsArray (const CrsArray& rhs) : entries (rhs.entries), row_map (rhs.row_map)
{}
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
CrsArray& operator= (const CrsArray& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
return *this;
}
/** \brief Destroy this view of the array.
* If the last view then allocated memory is deallocated.
*/
~CrsArray() {}
};
//----------------------------------------------------------------------------
template< class CrsArrayType , class InputSizeType >
typename CrsArrayType::crsarray_type
create_crsarray( const std::string & label ,
const std::vector< InputSizeType > & input );
template< class CrsArrayType , class InputSizeType >
typename CrsArrayType::crsarray_type
create_crsarray( const std::string & label ,
const std::vector< std::vector< InputSizeType > > & input );
//----------------------------------------------------------------------------
template< class DataType ,
class Arg1Type ,
class Arg2Type ,
typename SizeType >
typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
template< class DataType ,
class Arg1Type ,
class Arg2Type ,
typename SizeType >
typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input );
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <impl/Kokkos_CrsArray_factory.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CRSARRAY_HPP */
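A hedged example of building a CrsArray from the ragged std::vector input accepted by create_crsarray above; the label, device, and values are placeholders:
#include <Kokkos_CrsArray.hpp>
#include <vector>
void build_example_graph()
{
  // Three rows with 2, 0 and 3 entries; the resulting row_map is {0,2,2,5}.
  std::vector< std::vector<int> > graph(3);
  graph[0].push_back(10); graph[0].push_back(11);
  graph[2].push_back(20); graph[2].push_back(21); graph[2].push_back(22);
  typedef Kokkos::CrsArray< int , Kokkos::Cuda > crs_type ;
  crs_type a = Kokkos::create_crsarray< crs_type >( "example_graph" , graph );
  // Entries of row i occupy positions row_map[i] .. row_map[i+1]-1 of a.entries.
}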

View File

@@ -0,0 +1,323 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_HPP
#define KOKKOS_CUDA_HPP
#include <iosfwd>
#include <vector>
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_HAVE_OPENMP
#include <Kokkos_OpenMP.hpp>
#else
#ifdef KOKKOS_HAVE_PTHREAD
#include <Kokkos_Threads.hpp>
#else
#include <Kokkos_Serial.hpp>
#endif
#endif
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class CudaExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class Cuda
/// \brief Kokkos device that uses CUDA to run on GPUs.
///
/// A "device" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads device uses Pthreads or
/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
/// extensions, and the Serial device executes "parallel" kernels
/// sequentially. The Cuda device uses NVIDIA's CUDA programming
/// model to execute kernels in parallel on GPUs.
class Cuda {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! The device type (same as this class).
typedef Cuda device_type ;
//! This device's preferred memory space.
typedef CudaSpace memory_space ;
//! The size_type typedef best suited for this device.
typedef CudaSpace::size_type size_type ;
//! This device's preferred array layout.
typedef LayoutLeft array_layout ;
//! This device's host mirror type.
#ifdef KOKKOS_HAVE_OPENMP
typedef Kokkos::OpenMP host_mirror_device_type ;
#else
#ifdef KOKKOS_HAVE_PTHREAD
typedef Kokkos::Threads host_mirror_device_type ;
#else
typedef Kokkos::Serial host_mirror_device_type ;
#endif
#endif
//@}
//! \name Functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __CUDA_ARCH__ )
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
//! Free any resources being consumed by the device.
static void finalize();
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
//--------------------------------------------------------------------------
//! \name Device-specific functions
//@{
struct SelectDevice {
int cuda_device_id ;
SelectDevice() : cuda_device_id(0) {}
explicit SelectDevice( int id ) : cuda_device_id( id ) {}
};
//! Initialize, telling the CUDA run-time library which device to use.
static void initialize( const SelectDevice = SelectDevice() );
static int is_initialized();
/// \brief Cuda device architecture of the selected device.
///
/// This matches the __CUDA_ARCH__ specification.
static size_type device_arch();
//! Query device count.
static size_type detect_device_count();
/** \brief Detect the available devices and their architecture
* as defined by the __CUDA_ARCH__ specification.
*/
static std::vector<unsigned> detect_device_arch();
static unsigned team_max();
//@}
//--------------------------------------------------------------------------
#if defined( __CUDA_ARCH__ )
//! \name Functions for the functor device interface
//@{
__device__ inline int league_size() const { return gridDim.x ; }
__device__ inline int league_rank() const { return blockIdx.x ; }
__device__ inline int team_size() const { return blockDim.x ; }
__device__ inline int team_rank() const { return threadIdx.x ; }
__device__ inline void team_barrier() const { __syncthreads(); }
__device__ inline unsigned int team_barrier_count(bool value) const
{ return __syncthreads_count(value); }
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
__device__ inline Type team_scan( const Type & value );
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename TypeLocal , typename TypeGlobal >
__device__ inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum );
//! Get a pointer to shared memory for this team.
__device__ inline void * get_shmem( const int size );
__device__ inline Cuda( Impl::CudaExec & exec ) : m_exec(exec) {}
__device__ inline Cuda( const Cuda & rhs ) : m_exec(rhs.m_exec) {}
//@}
//--------------------------------------------------------------------------
private:
Impl::CudaExec & m_exec ;
//--------------------------------------------------------------------------
#else
int league_size() const ;
int league_rank() const ;
int team_size() const ;
int team_rank() const ;
void team_barrier() const ;
unsigned int team_barrier_count(bool) const ;
template< typename T >
inline T team_scan(const T& value);
template< typename TypeLocal , typename TypeGlobal >
inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum );
void * get_shmem( const int size );
Cuda( Impl::CudaExec & );
#endif
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda-specific parallel work configuration */
struct CudaWorkConfig {
Cuda::size_type grid[3] ; //< Grid dimensions
Cuda::size_type block[3] ; //< Block dimensions
Cuda::size_type shared ; //< Shared memory size
CudaWorkConfig()
{
enum { WarpSize = 32 };
grid[0] = grid[1] = grid[2] = 1 ;
block[1] = block[2] = 1 ;
block[0] = 8 * WarpSize ;
shared = 0 ;
}
};
template< class FunctorType >
inline
void parallel_for( const CudaWorkConfig & work_config ,
const FunctorType & functor )
{
Impl::ParallelFor< FunctorType , CudaWorkConfig , Cuda >
( work_config , functor );
}
template< class FunctorType , class FinalizeType >
inline
void parallel_reduce( const CudaWorkConfig & work_config ,
const FunctorType & functor ,
const FinalizeType & finalize );
template< class FunctorType >
inline
typename FunctorType::value_type
parallel_reduce( const CudaWorkConfig & work_config ,
const FunctorType & functor );
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#endif /* #ifndef KOKKOS_CUDA_HPP */
//----------------------------------------------------------------------------
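A hedged sketch of the device lifecycle declared above; any initialization the host mirror device may require, and the actual kernels dispatched in between, are omitted:
#include <Kokkos_Cuda.hpp>
#include <iostream>
int main()
{
  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );  // pick device 0
  Kokkos::Cuda::print_configuration( std::cout , true );      // detailed report
  // ... parallel_for / parallel_reduce dispatches go here ...
  Kokkos::Cuda::fence();      // wait for asynchronously dispatched functors
  Kokkos::Cuda::finalize();   // release device resources
  return 0;
}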

View File

@@ -0,0 +1,184 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDASPACE_HPP
#define KOKKOS_CUDASPACE_HPP
#if defined( __CUDACC__ )
#include <cuda_runtime.h>
#endif
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_Macros.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda memory management */
class CudaSpace {
public:
typedef CudaSpace memory_space ;
typedef unsigned int size_type ;
/** \brief Allocate a contiguous block of memory on the Cuda device
* with size = scalar_size * scalar_count.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*
* Allocation may only occur on the master thread of the process.
*/
static void * allocate( const std::string & label ,
const std::type_info & scalar_type ,
const size_t scalar_size ,
const size_t scalar_count );
/** \brief Increment the reference count of the block of memory
* in which the input pointer resides.
*
* Reference counting only occurs on the master thread.
*/
static void increment( const void * );
/** \brief Decrement the reference count of the block of memory
* in which the input pointer resides. If the reference
* count falls to zero the memory is deallocated.
*
* Reference counting only occurs on the master thread.
*/
static void decrement( const void * );
/** \brief Print all tracked memory to the output stream. */
static void print_memory_view( std::ostream & );
/** \brief Retrieve label associated with the input pointer */
static std::string query_label( const void * );
/*--------------------------------*/
static void access_error();
static void access_error( const void * const );
/*--------------------------------*/
};
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
struct DeepCopy<HostSpace,CudaSpace> {
DeepCopy( void * dst , const void * src , size_t );
};
template<>
struct DeepCopy<CudaSpace,HostSpace> {
DeepCopy( void * dst , const void * src , size_t );
};
template<>
struct DeepCopy<CudaSpace,CudaSpace> {
DeepCopy( void * dst , const void * src , size_t );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Cuda code accessing Cuda data is good. */
template<>
struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , CudaSpace >
{
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
/** \brief Cuda code accessing non-Cuda data is bad. */
template<>
struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , HostSpace >
{
KOKKOS_INLINE_FUNCTION static void verify(void)
{ Kokkos::cuda_abort("Cuda code called function restricted to HostSpace"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::cuda_abort("Cuda code attempted to access HostSpace memory"); }
};
/** \brief Produce error message when trying to access Cuda
* memory on the host.
*/
template<>
struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , CudaSpace >
{
#ifdef KOKKOS_USE_UVM
inline static void verify( void ) { }
inline static void verify( const void * p ) { }
#else
inline static void verify( void ) { CudaSpace::access_error(); }
inline static void verify( const void * p ) { CudaSpace::access_error(p); }
#endif
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_CUDASPACE_HPP */
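A hedged sketch of how the DeepCopy specializations declared above are used; in practice this is hidden behind deep_copy on Views, and the destination space is always the first template argument:
// Copy n doubles from a host buffer into device memory.
inline void copy_to_device( double * dev_dst , const double * host_src , size_t n )
{
  Kokkos::Impl::DeepCopy< Kokkos::CudaSpace , Kokkos::HostSpace >(
    dev_dst , host_src , n * sizeof(double) );
}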

View File

@@ -0,0 +1,139 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDATYPES_HPP
#define KOKKOS_CUDATYPES_HPP
#include <Kokkos_Macros.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
namespace Kokkos {
typedef ::int2 int2 ;
typedef ::int3 int3 ;
typedef ::int4 int4 ;
typedef ::float2 float2 ;
typedef ::float3 float3 ;
typedef ::float4 float4 ;
typedef ::double2 double2 ;
typedef ::double3 double3 ;
typedef ::double4 double4 ;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#else /* NOT #if defined( __CUDACC__ ) */
namespace Kokkos {
struct int2 {
int x;
int y;
};
struct int3 {
int x;
int y;
int z;
};
struct int4 {
int x;
int y;
int z;
int w;
};
struct float2 {
float x;
float y;
};
struct float3 {
float x;
float y;
float z;
};
struct float4 {
float x;
float y;
float z;
float w;
};
struct double2 {
double x;
double y;
};
struct double3 {
double x;
double y;
double z;
};
struct double4 {
double x;
double y;
double z;
double w;
};
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_CUDATYPES_HPP */

View File

@@ -0,0 +1,144 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HOSTSPACE_HPP
#define KOKKOS_HOSTSPACE_HPP
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_Macros.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_MemoryTracking.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Memory management on the host for devices */
class HostSpace {
public:
typedef HostSpace memory_space ;
typedef size_t size_type ;
/** \brief Allocate a contiguous block of memory on the host
* with size = scalar_size * scalar_count.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*
* Allocation may only occur on the master thread of the process.
*/
static void * allocate( const std::string & label ,
const std::type_info & scalar_type ,
const size_t scalar_size ,
const size_t scalar_count );
/** \brief Increment the reference count of the block of memory
* in which the input pointer resides.
*
* Reference counting only occurs on the master thread.
*/
static void increment( const void * );
/** \brief Decrement the reference count of the block of memory
* in which the input pointer resides. If the reference
* count falls to zero the memory is deallocated.
*
* Reference counting only occurs on the master thread.
*/
static void decrement( const void * );
/*--------------------------------*/
/** \brief Print all tracked memory to the output stream. */
static void print_memory_view( std::ostream & );
/** \brief Retrieve label associated with the input pointer */
static std::string query_label( const void * );
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
};
//----------------------------------------------------------------------------
template< class ExecutionSpace , class DataSpace >
struct VerifyExecutionSpaceCanAccessDataSpace ;
template<>
struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , HostSpace >
{
inline static void verify(void) {}
inline static void verify(const void *) {}
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class , class > struct DeepCopy ;
template<>
struct DeepCopy<HostSpace,HostSpace> {
DeepCopy( void * dst , const void * src , size_t n );
};
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_HOSTSPACE_HPP */
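A hedged host-side sketch of the reference-counted allocation interface declared above; the label string "example_block" is an arbitrary example, and this is not part of the library source.
#include <typeinfo>
#include <iostream>
#include <Kokkos_HostSpace.hpp>

int main() {
  // Allocate a tracked block of 100 doubles; allocation starts with a reference count of one.
  void * p = Kokkos::HostSpace::allocate( "example_block", typeid(double), sizeof(double), 100 );

  std::cout << Kokkos::HostSpace::query_label( p ) << std::endl;   // prints "example_block"

  Kokkos::HostSpace::increment( p );   // reference count: 2
  Kokkos::HostSpace::decrement( p );   // reference count: 1
  Kokkos::HostSpace::decrement( p );   // count reaches zero, block is deallocated
  return 0;
}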

View File

@ -0,0 +1,164 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Layout.hpp
/// \brief Declaration of various \c MemoryLayout options.
#ifndef KOKKOS_LAYOUT_HPP
#define KOKKOS_LAYOUT_HPP
#include <stddef.h>
#include <impl/Kokkos_Traits.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
/// \struct LayoutLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Layout left" indicates a mapping where the leftmost index i0
/// refers to contiguous access, and strides increase for dimensions
/// going right from there (i1, i2, ...). This layout imitates how
/// Fortran stores multi-dimensional arrays. For the special case of
/// a two-dimensional array, "layout left" is also called "column
/// major."
struct LayoutLeft { typedef LayoutLeft array_layout ; };
//----------------------------------------------------------------------------
/// \struct LayoutRight
/// \brief Memory layout tag indicating right-to-left (C or
/// lexicographical scheme) striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Right layout" indicates a mapping where the rightmost index ik
/// refers to contiguous access, and strides increase for dimensions
/// going left from there. This layout imitates how C stores
/// multi-dimensional arrays. For the special case of a
/// two-dimensional array, "layout right" is also called "row major."
struct LayoutRight { typedef LayoutRight array_layout ; };
//----------------------------------------------------------------------------
/// \struct LayoutStride
/// \brief Memory layout tag indicating arbitrarily strided
/// multi-index mapping into contiguous memory.
struct LayoutStride {
typedef LayoutStride array_layout ;
enum { MAX_RANK = 8 };
size_t dimension[ MAX_RANK ] ;
size_t stride[ MAX_RANK ] ;
/** \brief Compute strides from ordered dimensions.
*
* Values of order uniquely form the set [0..rank)
* and specify ordering of the dimensions.
* Order = {0,1,2,...} is LayoutLeft
* Order = {...,2,1,0} is LayoutRight
*/
template< typename iTypeOrder , typename iTypeDimen >
KOKKOS_INLINE_FUNCTION static
LayoutStride order_dimensions( int const rank
, iTypeOrder const * const order
, iTypeDimen const * const dimen )
{
LayoutStride tmp ;
// Verify valid rank order:
int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ;
for ( int r = 0 ; r < MAX_RANK ; ++r ) {
tmp.dimension[r] = 0 ;
tmp.stride[r] = 0 ;
check_input &= ~int( 1 << order[r] );
}
if ( 0 == check_input ) {
size_t n = 1 ;
for ( int r = 0 ; r < rank ; ++r ) {
tmp.stride[ order[r] ] = n ;
n *= ( tmp.dimension[r] = dimen[r] );
}
}
return tmp ;
}
};
//----------------------------------------------------------------------------
/// \struct LayoutTileLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices by tiles.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Tiled layout" indicates a mapping to contiguously stored
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
/// dimensions. Indices are LayoutLeft within each tile, and the
/// tiles themselves are arranged using LayoutLeft. Note that the
/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
/// compile-time constants. This speeds up index calculations. If
/// both tile dimensions are powers of two, Kokkos can optimize
/// further.
template < unsigned ArgN0 , unsigned ArgN1 ,
bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
Impl::is_power_of_two<ArgN1>::value )
>
struct LayoutTileLeft {
typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
enum { N0 = ArgN0 };
enum { N1 = ArgN1 };
};
} // namespace Kokkos
#endif // #ifndef KOKKOS_LAYOUT_HPP
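A small worked sketch of LayoutStride::order_dimensions for the identity ordering {0,1,2}, which the comment above identifies with LayoutLeft. The arrays are padded to MAX_RANK because order_dimensions scans all MAX_RANK order entries; the dimensions {2,3,4} are an arbitrary example.
#include <cassert>
#include <Kokkos_Layout.hpp>

int main() {
  // Padded to MAX_RANK = 8; only the first rank = 3 entries matter.
  const int order[ Kokkos::LayoutStride::MAX_RANK ] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  const int dimen[ Kokkos::LayoutStride::MAX_RANK ] = { 2, 3, 4, 0, 0, 0, 0, 0 };

  Kokkos::LayoutStride s =
    Kokkos::LayoutStride::order_dimensions( 3, order, dimen );

  // Left-to-right striding: 1, 2, 2*3 = 6.
  assert( s.stride[0] == 1 && s.stride[1] == 2 && s.stride[2] == 6 );
  assert( s.dimension[0] == 2 && s.dimension[1] == 3 && s.dimension[2] == 4 );
  return 0;
}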

View File

@ -0,0 +1,227 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MACROS_HPP
#define KOKKOS_MACROS_HPP
#include <KokkosCore_config.h>
#include <impl/Kokkos_Compiler_Macros.hpp>
namespace Kokkos {
class HostSpace ;
class CudaSpace ;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
// Compiling with CUDA compiler.
#if ! defined( KOKKOS_HAVE_CUDA )
#error "Compiling Kokkos with Cuda compiler but KOKKOS_HAVE_CUDA is undefined"
#endif
#include <cuda.h>
/* Compiling with a CUDA compiler for device code.
*
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
*
* When generating device code the __CUDA_ARCH__ macro is defined as:
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
*/
#if ! defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION"
#endif
#if ( CUDA_VERSION < 4010 )
#error "Cuda version 4.1 or greater required"
#endif
#endif /* #if defined( __CUDACC__ ) */
//----------------------------------------------------------------------------
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
/* Compiling with CUDA compiler for device code. */
#if ( __CUDA_ARCH__ < 200 )
#error "Cuda device capability >= 2.0 is required"
#endif
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#endif /* #if defined( __CUDACC__ ) && #if defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
#if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
/* Compiling with CUDA compiler for host code. */
#define KOKKOS_FORCEINLINE_FUNCTION __forceinline__
#endif /* #if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __INTEL_COMPILER )
#if (__INTEL_COMPILER < 1200)
#define KOKKOS_DISABLE_ASM true;
#endif
/* Compiling with Intel compiler */
/* TBD: Version testing */
#ifndef KOKKOS_FORCEINLINE_FUNCTION
#define KOKKOS_FORCEINLINE_FUNCTION __forceinline
#endif
#if defined( __MIC__ )
/* Compiling with Intel compiler for execution on an Intel MIC device.
* These devices are used in no-offload mode so the HostSpace is the MIC space.
*/
#else
#ifndef KOKKOS_USE_PRAGMA_SIMD
#define KOKKOS_USE_PRAGMA_SIMD
#endif
/*
#pragma simd vectorlength(N)
#pragma ivdep
*/
#endif /* #if defined( __MIC__ ) */
#endif /* #if defined( __INTEL_COMPILER ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __GNUC__ ) /* GNU C */ || \
defined( __GNUG__ ) /* GNU C++ */
/* Compiling with GNU compiler */
#ifndef KOKKOS_FORCEINLINE_FUNCTION
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
/* Compiling with GNU compatible compiler. */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( _OPENMP )
#if ! defined( KOKKOS_HAVE_OPENMP )
#error "Compiling Kokkos for OpenMP but KOKKOS_HAVE_OPENMP is undefined"
#endif
/* Compiling with OpenMP.
* The value of _OPENMP is an integer value YYYYMM
* where YYYY and MM are the year and month designation
* of the supported OpenMP API version.
*/
#endif /* END: #if defined( _OPENMP ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#ifndef KOKKOS_FUNCTION
#define KOKKOS_FUNCTION /* */
#endif
#ifndef KOKKOS_INLINE_FUNCTION
#define KOKKOS_INLINE_FUNCTION inline
#endif
#ifndef KOKKOS_FORCEINLINE_FUNCTION
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
namespace Kokkos { typedef CudaSpace ExecutionSpace ; }
#else
namespace Kokkos { typedef HostSpace ExecutionSpace ; }
#endif
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \
Kokkos::ExecutionSpace , DATA_SPACE >::verify( DATA_PTR )
#define KOKKOS_RESTRICT_EXECUTION_TO( DATA_SPACE ) \
Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \
Kokkos::ExecutionSpace , DATA_SPACE >::verify()
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_MACROS_HPP */
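A minimal sketch of how the function annotation macros above are meant to be used: the same source builds as a __host__ __device__ function under nvcc and as an ordinary inline function elsewhere. It assumes a configured Kokkos build so that <KokkosCore_config.h> exists.
#include <Kokkos_Macros.hpp>

KOKKOS_INLINE_FUNCTION
double axpy( double a, double x, double y ) { return a * x + y; }

KOKKOS_FORCEINLINE_FUNCTION
double square( double x ) { return x * x; }

int main() {
  // 2*3 + 1*1 = 7
  return ( axpy( 2.0, 3.0, square( 1.0 ) ) == 7.0 ) ? 0 : 1;
}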

View File

@ -0,0 +1,111 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MEMORYTRAITS_HPP
#define KOKKOS_MEMORYTRAITS_HPP
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Memory access traits for views, an extension point.
*
* These traits should be orthogonal. If there are dependencies then
* the MemoryTraits template must detect and enforce dependencies.
*
* A zero value is the default for a View, indicating that none of
* these traits are present.
*/
enum MemoryTraitsFlags
{ Unmanaged = 0x01
, RandomAccess = 0x02
};
template < unsigned T >
struct MemoryTraits {
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
typedef MemoryTraits memory_traits ;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
typedef Kokkos::MemoryTraits<0> MemoryManaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Memory alignment settings
*
* Sets the global value for memory alignment.
* Enables compatibility of views from different devices with static strides.
* Define KOKKOS_MEMORY_ALIGNMENT at compile time to override the default.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
KOKKOS_MEMORY_ALIGNMENT
#else
128
#endif
};
enum { MEMORY_ALIGNMENT_THRESHOLD = 4 };
} //namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
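A short compile-time sketch showing that the trait flags above compose as bits and that the convenience typedefs are plain MemoryTraits instantiations; the typedef name Traits is only illustrative.
#include <Kokkos_MemoryTraits.hpp>

int main() {
  typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > Traits;
  // Both flag bits are visible on the instantiation; MemoryManaged sets neither.
  return ( Traits::Unmanaged && Traits::RandomAccess &&
           Kokkos::MemoryManaged::Unmanaged == 0 ) ? 0 : 1;
}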

View File

@ -0,0 +1,189 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos
// Manycore Performance-Portable Multidimensional Arrays
//
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_HPP
#define KOKKOS_OPENMP_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_HAVE_OPENMP)
#include <omp.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class OpenMPexec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class OpenMP
/// \brief Kokkos device for multicore processors in the host memory space.
class OpenMP {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
typedef OpenMP device_type ;
typedef HostSpace::size_type size_type ;
typedef HostSpace memory_space ;
typedef LayoutRight array_layout ;
typedef OpenMP host_mirror_device_type ;
//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
inline static bool in_parallel() { return omp_in_parallel(); }
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake();
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() {}
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
static void finalize();
/** \brief Initialize the device.
*
* 1) If the hardware locality library is enabled and OpenMP has not
* already bound threads then bind OpenMP threads to maximize
* core utilization and group for memory hierarchy locality.
*
* 2) Allocate a HostThread for each OpenMP thread to hold its
* topology and fan in/out data.
*/
#if 0
static void initialize( const unsigned team_count = 1 ,
const unsigned threads_per_team = 1 ,
const unsigned use_numa_count = 0 ,
const unsigned use_cores_per_numa = 0 );
#endif
static void initialize( unsigned thread_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 );
static int is_initialized();
KOKKOS_FUNCTION static unsigned league_max();
KOKKOS_FUNCTION static unsigned team_max();
//@}
//------------------------------------
//! \name Functions for the functor device interface
//@{
KOKKOS_INLINE_FUNCTION int league_rank() const ;
KOKKOS_INLINE_FUNCTION int league_size() const ;
KOKKOS_INLINE_FUNCTION int team_rank() const ;
KOKKOS_INLINE_FUNCTION int team_size() const ;
KOKKOS_INLINE_FUNCTION void team_barrier();
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value );
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename TypeLocal , typename TypeGlobal >
KOKKOS_INLINE_FUNCTION TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum );
KOKKOS_INLINE_FUNCTION void * get_shmem( const int size );
explicit inline OpenMP( Impl::OpenMPexec & );
//------------------------------------
private:
Impl::OpenMPexec & m_exec ;
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined(KOKKOS_HAVE_OPENMP) */
#endif /* #ifndef KOKKOS_OPENMP_HPP */
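A hedged host-side sketch of the device lifecycle declared above, assuming an OpenMP-enabled build (KOKKOS_HAVE_OPENMP); the thread count of 4 is arbitrary.
#include <iostream>
#include <Kokkos_OpenMP.hpp>

int main() {
  Kokkos::OpenMP::initialize( 4 );                  // request 4 threads
  Kokkos::OpenMP::print_configuration( std::cout );
  std::cout << "in_parallel: " << Kokkos::OpenMP::in_parallel() << std::endl;  // 0 outside a parallel region
  // ... dispatch parallel_for / parallel_reduce functors whose device_type is Kokkos::OpenMP ...
  Kokkos::OpenMP::finalize();
  return 0;
}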

View File

@ -0,0 +1,457 @@
/// \file Kokkos_Pair.hpp
/// \brief Declaration and definition of Kokkos::pair.
///
/// This header file declares and defines Kokkos::pair and its related
/// nonmember functions.
#ifndef KOKKOS_PAIR_HPP
#define KOKKOS_PAIR_HPP
#include <Kokkos_Macros.hpp>
#include <utility>
namespace Kokkos {
/// \struct pair
/// \brief Replacement for std::pair that works on CUDA devices.
///
/// The instance methods of std::pair, including its constructors, are
/// not marked as <tt>__device__</tt> functions. Thus, they cannot be
/// called on a CUDA device, such as an NVIDIA GPU. This struct
/// implements the same interface as std::pair, but can be used on a
/// CUDA device as well as on the host.
template <class T1, class T2>
struct pair
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Default constructor.
///
/// This calls the default constructors of T1 and T2. It won't
/// compile if those default constructors are not defined and
/// public.
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first(), second()
{}
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, T2> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2&>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1, T2&>
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
//! Equality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first==rhs.first && lhs.second==rhs.second; }
//! Inequality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs==rhs); }
//! Less-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
//! Less-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(rhs<lhs); }
//! Greater-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return rhs<lhs; }
//! Greater-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs<rhs); }
/// \brief Return a new pair.
///
/// This is a "nonmember constructor" for Kokkos::pair. It works just
/// like std::make_pair.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1,T2> make_pair (T1 x, T2 y)
{ return ( pair<T1,T2>(x,y) ); }
/// \brief Return a pair of references to the input arguments.
///
/// This is analogous to std::tie (new in C++11). You can use it to
/// assign to two variables at once, from the result of a function
/// that returns a pair. For example (<tt>__device__</tt> and
/// <tt>__host__</tt> attributes omitted for brevity):
/// \code
/// // Declaration of the function to call.
/// // First return value: operation count.
/// // Second return value: whether all operations succeeded.
/// Kokkos::pair<int, bool> someFunction ();
///
/// // Code that uses Kokkos::tie.
/// int myFunction () {
/// int count = 0;
/// bool success = false;
///
/// // This assigns to both count and success.
/// Kokkos::tie (count, success) = someFunction ();
///
/// if (! success) {
/// // ... Some operation failed;
/// // take corrective action ...
/// }
/// return count;
/// }
/// \endcode
///
/// The line that uses tie() could have been written like this:
/// \code
/// Kokkos::pair<int, bool> result = someFunction ();
/// count = result.first;
/// success = result.second;
/// \endcode
///
/// Using tie() saves two lines of code and avoids a copy of each
/// element of the pair. The latter could be significant if one or
/// both elements of the pair are more substantial objects than \c int
/// or \c bool.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1 &,T2 &> tie (T1 & x, T2 & y)
{ return ( pair<T1 &,T2 &>(x,y) ); }
//
// Specialization of Kokkos::pair for a \c void second argument. This
// is not actually a "pair"; it only contains one element, the first.
//
template <class T1>
struct pair<T1,void>
{
typedef T1 first_type;
typedef void second_type;
first_type first;
enum { second = 0 };
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first()
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f)
: first(f)
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f, int)
: first(f)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,void> &p)
: first(p.first)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, void> & operator=(const pair<U,void> &p)
{
first = p.first;
return *this;
}
};
//
// Specialization of relational operators for Kokkos::pair<T1,void>.
//
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first==rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs==rhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first<rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(rhs<lhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return rhs<lhs; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs<rhs); }
} // namespace Kokkos
#endif //KOKKOS_PAIR_HPP
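A host-side sketch of Kokkos::pair, make_pair, and tie as declared above; the function find_positive is an invented example.
#include <cassert>
#include <Kokkos_Pair.hpp>

Kokkos::pair<int,bool> find_positive( int x ) {
  return Kokkos::make_pair( x, x > 0 );
}

int main() {
  int value = 0; bool positive = false;

  // tie() assigns both results at once, as described in the documentation above.
  Kokkos::tie( value, positive ) = find_positive( 3 );
  assert( value == 3 && positive );

  // Relational operators compare lexicographically, as with std::pair.
  assert( Kokkos::make_pair( 1, 2 ) < Kokkos::make_pair( 1, 3 ) );

  // Conversion to std::pair is explicit via to_std_pair().
  std::pair<int,bool> sp = find_positive( -1 ).to_std_pair();
  assert( ! sp.second );
  return 0;
}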

View File

@ -0,0 +1,765 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Parallel.hpp
/// \brief Declaration of parallel operators
#ifndef KOKKOS_PARALLEL_HPP
#define KOKKOS_PARALLEL_HPP
#include <cstddef>
#include <Kokkos_Macros.hpp>
#include <Kokkos_View.hpp>
#include <impl/Kokkos_Traits.hpp>
namespace Kokkos {
#if defined ( KOKKOS_HAVE_CUDA )
class Cuda ;
#endif
#if defined ( KOKKOS_HAVE_OPENMP )
class OpenMP ;
#endif
#if defined ( KOKKOS_HAVE_PTHREAD )
class Threads ;
#endif
#if defined ( KOKKOS_HAVE_SERIAL )
class Serial ;
#endif
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Cuda DefaultDeviceType;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef OpenMP DefaultDeviceType;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Threads DefaultDeviceType;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \
!defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Serial DefaultDeviceType;
#else
#if defined ( KOKKOS_HAVE_CUDA )
typedef Kokkos::Cuda DefaultDeviceType;
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined ( KOKKOS_HAVE_OPENMP )
typedef OpenMP DefaultDeviceType;
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined ( KOKKOS_HAVE_PTHREAD )
typedef Threads DefaultDeviceType;
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
#else
typedef Serial DefaultDeviceType;
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
#endif
#endif
}
}
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Enable = void >
struct FunctorHasDeviceType : public false_type {};
template< class FunctorType >
struct FunctorHasDeviceType< FunctorType , typename
enable_if< ! is_same<typename FunctorType::device_type,int>::value >::type >
: public true_type {};
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/// \class ParallelFor
/// \brief Implementation of the ParallelFor operator that has a
/// partial specialization for the device.
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType ,
class WorkSpec ,
class DeviceType = typename FunctorType::device_type >
class ParallelFor ;
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/// \class VectorParallel
/// \brief Request for parallel_for to attempt thread+vector parallelism.
struct VectorParallel
{
const size_t nwork ;
VectorParallel( const size_t n ) : nwork(n) {}
operator size_t () const { return nwork ; }
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execute \c functor \c work_count times in parallel.
*
* A "functor" is a class containing the function to execute in
* parallel, any data needed for that execution, and a \c device_type
* typedef. Here is an example functor for parallel_for:
*
* \code
* class FunctorType {
* public:
* typedef ... device_type ;
* void operator() (IntType iwork) const ;
* };
* \endcode
*
* In the above example, \c IntType is any integer type for which a
* valid conversion from \c size_t to \c IntType exists. Its
* <tt>operator()</tt> method defines the operation to parallelize,
* over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
* This corresponds to a single iteration \c iwork of a \c for loop.
*/
template< class FunctorType >
inline
void parallel_for( const size_t work_count ,
const FunctorType & functor ,
typename Impl::enable_if<Impl::FunctorHasDeviceType<FunctorType>::value,int>::type = 0 )
{
Impl::ParallelFor< FunctorType , size_t > tmp( functor , work_count );
}
template< class FunctorType >
inline
void parallel_for( const size_t work_count ,
const FunctorType & functor ,
typename Impl::enable_if<!Impl::FunctorHasDeviceType<FunctorType>::value,int>::type = 0 )
{
Impl::ParallelFor< FunctorType , size_t, Impl::DefaultDeviceType >
tmp( functor , work_count );
}
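Illustrative sketch, not library code: a functor matching the concept documented above, dispatched over a one-dimensional View. The functor name, the View argument, and the choice of Kokkos::OpenMP as device_type are assumptions (any enabled device would do, and <Kokkos_OpenMP.hpp> is assumed to have been included).
struct FillFunctor {
  typedef Kokkos::OpenMP device_type ;             // assumed device
  Kokkos::View< double * , device_type > x ;
  FillFunctor( Kokkos::View< double * , device_type > arg_x ) : x( arg_x ) {}
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_t i ) const { x(i) = 2.0 * double(i) ; }
};

inline void fill_example( Kokkos::View< double * , Kokkos::OpenMP > x )
{
  // One call of operator() per index in [0, x.dimension_0()).
  Kokkos::parallel_for( x.dimension_0() , FillFunctor( x ) );
}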
/** \brief Execute \c functor \c work_count times in parallel, with vectorization.
*
* This is like parallel_for, except that it <i>mandates</i>
* vectorization as well as parallelization of the given functor. We
* emphasize "mandates": this means that the user asserts that
* vectorization is correct, and insists that the compiler vectorize.
* Mandating vectorization is not always desirable, for example if the
* body of the functor is complicated. In some cases, users might
* want to parallelize over threads, and use vectorization inside the
* parallel operation. Furthermore, the compiler might still be able
* to vectorize through a parallel_for. Thus, users should take care
* not to use this execution option arbitrarily.
*/
template< class FunctorType >
inline
void vector_parallel_for( const size_t work_count ,
const FunctorType & functor )
{
Impl::ParallelFor< FunctorType , VectorParallel > tmp( functor , work_count );
}
template< class DeviceType >
class MultiFunctorParallelFor ;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType ,
class WorkSpec ,
class DeviceType = typename FunctorType::device_type >
class ParallelReduce ;
/// \class ReduceAdapter
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType ,
class ValueType = typename FunctorType::value_type >
struct ReduceAdapter ;
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... device_type ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... device_type ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count ,
const FunctorType & functor )
{
Impl::ParallelReduce< FunctorType , size_t > reduce( functor , work_count );
}
/** \brief Parallel reduction and output to host.
*
* If FunctorType::value_type is
* - \c PodType, then \c reference_type is <tt>PodType & </tt>.
* - <tt>PodType[]</tt>, then \c reference_type is <tt>PodType * </tt>.
*/
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count ,
const FunctorType & functor ,
typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result )
{
Impl::ParallelReduce< FunctorType, size_t >
reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) );
reduce.wait();
}
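Illustrative sketch, not library code: a sum reduction matching the POD-value functor concept documented above, returning the result through the reference overload just defined. The View argument and the OpenMP device are again assumptions.
struct SumFunctor {
  typedef Kokkos::OpenMP device_type ;             // assumed device
  typedef double value_type ;
  Kokkos::View< double * , device_type > x ;
  SumFunctor( Kokkos::View< double * , device_type > arg_x ) : x( arg_x ) {}
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_t i , value_type & update ) const { update += x(i) ; }
  KOKKOS_INLINE_FUNCTION
  void init( value_type & update ) const { update = 0 ; }
  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & update , volatile const value_type & input ) const
    { update += input ; }
};

inline double sum_example( Kokkos::View< double * , Kokkos::OpenMP > x )
{
  double total = 0 ;
  // For a POD value_type the reference_type is double &, so total is filled in place.
  Kokkos::parallel_reduce( x.dimension_0() , SumFunctor( x ) , total );
  return total ;
}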
template< class FunctorType >
inline
void parallel_reduce( const VectorParallel & work_count ,
const FunctorType & functor ,
typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result )
{
Impl::ParallelReduce< FunctorType, VectorParallel >
reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) );
reduce.wait();
}
template< class DeviceType >
class MultiFunctorParallelReduce ;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
///
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType ,
class WorkSpec ,
class DeviceType = typename FunctorType::device_type >
class ParallelScan ;
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/// \fn parallel_scan
/// \tparam FunctorType Type of the scan functor.
///
/// \param work_count [in] Number of work items.
/// \param functor [in] The scan functor.
///
/// This function implements a parallel scan operation. The scan can
/// be either inclusive or exclusive, depending on how you implement
/// the scan functor.
///
/// A scan functor looks almost exactly like a reduce functor, except
/// that its operator() takes a third \c bool argument, \c final_pass,
/// which indicates whether this is the last pass of the scan
/// operation. We will show below how to use the \c final_pass
/// argument to control whether the scan is inclusive or exclusive.
///
/// Here is the minimum required interface of a scan functor for a POD
/// (plain old data) value type \c PodType. That is, the result is a
/// View of zero or more PodType. It is also possible for the result
/// to be an array of (same-sized) arrays of PodType, but we do not
/// show the required interface for that here.
/// \code
/// class ScanFunctor {
/// public:
/// // The Kokkos device type
/// typedef ... device_type;
/// // Type of an entry of the array containing the result;
/// // also the type of each of the entries combined using
/// // operator() or join().
/// typedef PodType value_type;
/// typedef typename DeviceType::size_type size_type;
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const;
/// void init (value_type& update) const;
/// void join (volatile value_type& update, volatile const value_type& input) const
/// };
/// \endcode
///
/// Here is an example of a functor which computes an inclusive plus-scan
/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this
/// scan will overwrite that array with [1, 3, 6, 10].
///
/// \code
/// template<class DeviceType>
/// class InclScanFunctor {
/// public:
/// typedef DeviceType device_type;
/// typedef int value_type;
/// typedef typename DeviceType::size_type size_type;
///
/// InclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// update += x_(i);
/// if (final_pass) {
/// x_(i) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, device_type> x_;
/// };
/// \endcode
///
/// Here is an example of a functor which computes an <i>exclusive</i>
/// scan of an array of \c int, in place. In operator(), note both
/// that the final_pass test and the update have switched places, and
/// the use of a temporary. If given an array [1, 2, 3, 4], this scan
/// will overwrite that array with [0, 1, 3, 6].
///
/// \code
/// template<class DeviceType>
/// class ExclScanFunctor {
/// public:
/// typedef DeviceType device_type;
/// typedef int value_type;
/// typedef typename DeviceType::size_type size_type;
///
/// ExclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// const value_type x_i = x_(i);
/// if (final_pass) {
/// x_(i) = update;
/// }
/// update += x_i;
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, device_type> x_;
/// };
/// \endcode
///
/// Here is an example of a functor which builds on the above
/// exclusive scan example, to compute an offsets array from a
/// population count array, in place. We assume that the pop count
/// array has an extra entry at the end to store the final count. If
/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
/// array with [0, 1, 3, 6, 10].
///
/// \code
/// template<class DeviceType>
/// class OffsetScanFunctor {
/// public:
/// typedef DeviceType device_type;
/// typedef int value_type;
/// typedef typename DeviceType::size_type size_type;
///
/// // last_index_ is the last valid index (zero-based) of x.
/// // If x has length zero, then last_index_ won't be used anyway.
/// OffsetScanFunctor (Kokkos::View<value_type*, device_type> x) :
/// x_ (x), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
/// {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// const value_type x_i = x_(i);
/// if (final_pass) {
/// x_(i) = update;
/// }
/// update += x_i;
/// // The last entry of x_ gets the final sum.
/// if (final_pass && i == last_index_) {
/// x_(i) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, device_type> x_;
/// const size_type last_index_;
/// };
/// \endcode
///
template< class FunctorType >
inline
void parallel_scan( const size_t work_count ,
const FunctorType & functor )
{
Impl::ParallelScan< FunctorType , size_t > scan( functor , work_count );
}
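Illustrative sketch only: invoking a scan functor such as the InclScanFunctor written out in the comment above, assuming it has been defined exactly as shown there, and assuming an OpenMP build and a View argument named x.
inline void inclusive_scan_example( Kokkos::View< int * , Kokkos::OpenMP > x )
{
  // Overwrites x with its inclusive prefix sums, e.g. [1,2,3,4] -> [1,3,6,10].
  Kokkos::parallel_scan( x.dimension_0() , InclScanFunctor< Kokkos::OpenMP >( x ) );
}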
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Parallel work request for shared memory, league size, and team size.
*
* If the shared size is too large then slow (global) memory will be used.
* If the league or team sizes are too large then they will be reduced.
*/
struct ParallelWorkRequest {
size_t league_size ; ///< Size of league (number of teams in a league)
size_t team_size ; ///< Size of team (number of threads in a team)
KOKKOS_INLINE_FUNCTION
ParallelWorkRequest() : league_size(0), team_size(0) {}
KOKKOS_INLINE_FUNCTION
ParallelWorkRequest( size_t s0 , size_t s1 ) : league_size(s0), team_size(s1) {}
};
/** \brief Execute functor in parallel with work request,
* the actual league_size and team_size may be smaller.
*
* class FunctorType {
* public:
* typedef ... device_type ;
* void operator()( device_type ) const ;
* };
*/
template< class FunctorType >
inline
void parallel_for( const ParallelWorkRequest & request ,
const FunctorType & functor )
{
Kokkos::Impl::ParallelFor< FunctorType , ParallelWorkRequest >( functor , request );
}
} // namespace Kokkos
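Illustrative sketch, not library code: a team functor matching the per-team concept documented above, dispatched with a ParallelWorkRequest. The league size of 8 and team size of 4 are arbitrary, the OpenMP device is an assumption, and, as noted above, the runtime may shrink both sizes.
struct TeamHello {
  typedef Kokkos::OpenMP device_type ;             // assumed device
  KOKKOS_INLINE_FUNCTION
  void operator()( device_type dev ) const {
    // Each thread can query its coordinates within the league.
    (void) dev.league_rank();
    (void) dev.team_rank();
  }
};

inline void team_example()
{
  Kokkos::parallel_for( Kokkos::ParallelWorkRequest( 8 , 4 ) , TeamHello() );
}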
namespace Kokkos {
/** \brief Parallel reduction.
*
* class FunctorType {
* public:
* typedef ... device_type ;
* typedef <podType> value_type ; // POD type
* void operator()( device_type , <podType> & ) const ;
* void init( <podType> & ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
*
* class FunctorType { // For array of POD value
* public:
* typedef ... device_type ;
* typedef <podType> value_type[] ;
* void operator()( device_type , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
*/
template< class FunctorType >
inline
void parallel_reduce( const Kokkos::ParallelWorkRequest & request ,
const FunctorType & functor )
{
Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest > reduce( functor , request );
}
template< class FunctorType >
inline
void parallel_reduce( const Kokkos::ParallelWorkRequest & request ,
const FunctorType & functor ,
typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result )
{
Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest >
reduce( functor , request , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) );
reduce.wait(); // Wait for reduce to complete and output result
}
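// A minimal usage sketch, assuming a hypothetical team reduction functor
// "TeamSum" that follows the scalar interface documented above; "result"
// holds the reduction total when the call returns:
//
//   struct TeamSum {
//     typedef Kokkos::Threads device_type ;
//     typedef long value_type ;
//     KOKKOS_INLINE_FUNCTION
//     void operator()( device_type dev , value_type & update ) const
//       { update += dev.team_rank(); }
//     KOKKOS_INLINE_FUNCTION
//     void init( value_type & update ) const { update = 0 ; }
//     KOKKOS_INLINE_FUNCTION
//     void join( volatile value_type & update ,
//                volatile const value_type & input ) const { update += input ; }
//   };
//
//   long result = 0 ;
//   Kokkos::ParallelWorkRequest request( league_size , team_size );
//   Kokkos::parallel_reduce( request , TeamSum() , result );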
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Enable = void >
struct FunctorHasJoin : public false_type {};
template< class FunctorType >
struct FunctorHasJoin< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::join ) >::type >
: public true_type {};
template< class FunctorType , class Enable = void >
struct FunctorHasFinal : public false_type {};
template< class FunctorType >
struct FunctorHasFinal< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::final ) >::type >
: public true_type {};
template< class FunctorType , class Enable = void >
struct FunctorShmemSize
{
static inline size_t value( const FunctorType & ) { return 0 ; }
};
template< class FunctorType >
struct FunctorShmemSize< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
{
static inline size_t value( const FunctorType & f ) { return f.shmem_size() ; }
};
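// Illustrative behavior of the detection traits above: FunctorHasFinal<F>::value
// is true only when F declares a final() member, and FunctorShmemSize::value(f)
// returns f.shmem_size() only when F provides a shmem_size() member; otherwise
// the zero-size default above applies.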
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
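// ReduceAdapter provides a uniform interface over a reduction value that is
// either a single POD value (primary template below) or a runtime-sized POD
// array (the ScalarType[] specialization): reference/pointer access, value
// size and count, copy, join, and the optional final() hook.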
template< class FunctorType , class ScalarType >
struct ReduceAdapter
{
enum { StaticValueSize = sizeof(ScalarType) };
typedef ScalarType & reference_type ;
typedef ScalarType * pointer_type ;
typedef ScalarType scalar_type ;
KOKKOS_INLINE_FUNCTION static
reference_type reference( void * p ) { return *((ScalarType*) p); }
KOKKOS_INLINE_FUNCTION static
reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)[i]; }
KOKKOS_INLINE_FUNCTION static
pointer_type pointer( reference_type p ) { return & p ; }
KOKKOS_INLINE_FUNCTION static
unsigned value_count( const FunctorType & ) { return 1 ; }
KOKKOS_INLINE_FUNCTION static
unsigned value_size( const FunctorType & ) { return sizeof(ScalarType); }
KOKKOS_INLINE_FUNCTION static
void copy( const FunctorType & , void * const dst , const void * const src )
{ *((scalar_type*)dst) = *((const scalar_type*)src); }
KOKKOS_INLINE_FUNCTION static
void join( const FunctorType & f , volatile void * update , volatile const void * input )
{ f.join( *((volatile ScalarType*)update) , *((volatile const ScalarType*)input) ); }
template< class F >
KOKKOS_INLINE_FUNCTION static
void final( const F & f ,
typename enable_if< ( is_same<F,FunctorType>::value &&
FunctorHasFinal<F>::value )
>::type * p )
{ f.final( *((ScalarType *) p ) ); }
template< class F >
KOKKOS_INLINE_FUNCTION static
void final( const F & ,
typename enable_if< ( is_same<F,FunctorType>::value &&
! FunctorHasFinal<F>::value )
>::type * )
{}
};
template< class FunctorType , class ScalarType >
struct ReduceAdapter< FunctorType , ScalarType[] >
{
enum { StaticValueSize = 0 };
typedef ScalarType * reference_type ;
typedef ScalarType * pointer_type ;
typedef ScalarType scalar_type ;
KOKKOS_INLINE_FUNCTION static
ScalarType * reference( void * p ) { return (ScalarType*) p ; }
KOKKOS_INLINE_FUNCTION static
reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)+i; }
KOKKOS_INLINE_FUNCTION static
pointer_type pointer( reference_type p ) { return p ; }
KOKKOS_INLINE_FUNCTION static
unsigned value_count( const FunctorType & f ) { return f.value_count ; }
KOKKOS_INLINE_FUNCTION static
unsigned value_size( const FunctorType & f ) { return f.value_count * sizeof(ScalarType); }
KOKKOS_INLINE_FUNCTION static
void copy( const FunctorType & f , void * const dst , const void * const src )
{
for ( int i = 0 ; i < int(f.value_count) ; ++i ) {
((scalar_type*)dst)[i] = ((const scalar_type*)src)[i];
}
}
KOKKOS_INLINE_FUNCTION static
void join( const FunctorType & f , volatile void * update , volatile const void * input )
{ f.join( ((volatile ScalarType*)update) , ((volatile const ScalarType*)input) ); }
template< class F >
KOKKOS_INLINE_FUNCTION static
void final( const F & f ,
typename enable_if< ( is_same<F,FunctorType>::value &&
FunctorHasFinal<F>::value )
>::type * p )
{ f.final( ((ScalarType *) p ) ); }
template< class F >
KOKKOS_INLINE_FUNCTION static
void final( const F & ,
typename enable_if< ( is_same<F,FunctorType>::value &&
! FunctorHasFinal<F>::value )
>::type * )
{}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_PARALLEL_HPP */

View File

@ -0,0 +1,75 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_PARALLELREDUCE_HPP
#define KOKKOS_PARALLELREDUCE_HPP
#include <cstddef>
#include <sstream>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
template< class FunctorType >
void vector_parallel_reduce( const size_t work_count ,
const FunctorType & functor ,
typename Impl::ReduceAdapter< FunctorType >::reference_type result )
{
Impl::ParallelReduce< FunctorType, VectorParallel >
reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) );
reduce.wait();
}
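// A minimal usage sketch, assuming a hypothetical range functor "SumSq" with
// the usual value_type / operator()(i, update) / init / join members; "result"
// holds the total when the call returns:
//
//   double result = 0 ;
//   Kokkos::vector_parallel_reduce( n , SumSq() , result );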
//----------------------------------------------------------------------------
} // namespace Kokkos
//----------------------------------------------------------------------------
#endif /* KOKKOS_PARALLELREDUCE_HPP */

View File

@ -0,0 +1,240 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Serial.hpp
/// \brief Declaration and definition of Kokkos::Serial device.
#ifndef KOKKOS_SERIAL_HPP
#define KOKKOS_SERIAL_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class Serial
/// \brief Kokkos device for non-parallel execution
///
/// A "device" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads device uses Pthreads or
/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
/// extensions, and the Cuda device uses NVIDIA's CUDA programming
/// model. The Serial device executes "parallel" kernels
/// sequentially. This is useful if you really do not want to use
/// threads, or if you want to explore different combinations of MPI
/// and shared-memory parallel programming models.
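///
/// A minimal sketch of a range functor run on the Serial device (the functor
/// and View names are illustrative):
///
/// \code
/// struct AddOne {
///   typedef Kokkos::Serial device_type ;
///   Kokkos::View<double*, device_type> a_ ;
///   AddOne (Kokkos::View<double*, device_type> a) : a_ (a) {}
///   void operator () (const device_type::size_type i) const { a_(i) += 1.0; }
/// };
///
/// Kokkos::View<double*, Kokkos::Serial> a ("a", n);
/// Kokkos::parallel_for (a.dimension_0 (), AddOne (a));
/// \endcode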
class Serial {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! The device type (same as this class).
typedef Serial device_type ;
//! The size_type typedef best suited for this device.
typedef HostSpace::size_type size_type ;
//! This device's preferred memory space.
typedef HostSpace memory_space ;
//! This device's preferred array layout.
typedef LayoutRight array_layout ;
/// \brief This device's host mirror type.
///
/// Serial is a host device, so the host mirror type is the same as
/// the device type itself.
typedef Serial host_mirror_device_type ;
//@}
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
///
/// For the Serial device, this method <i>always</i> returns false,
/// because parallel_for or parallel_reduce with the Serial device
/// always execute sequentially.
inline static int in_parallel() { return false ; }
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
 * not ready for work. This may consume fewer resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence() {}
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false) {}
static int is_initialized() { return 1 ; }
//! Free any resources being consumed by the device.
static void finalize() {}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
inline int league_rank() const { return 0 ; }
inline int league_size() const { return 1 ; }
inline int team_rank() const { return 0 ; }
inline int team_size() const { return 1 ; }
inline void team_barrier() {}
inline std::pair<size_t,size_t> work_range( size_t n ) const
{ return std::pair<size_t,size_t>(0,n); }
template< typename T >
inline T * get_shmem( const int count );
static void * resize_reduce_scratch( const unsigned );
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
//TODO: Needs constructor for Kokkos::ParallelWorkRequest CRT
template< class FunctorType , class WorkSpec >
class ParallelFor< FunctorType , WorkSpec , Serial > {
public:
ParallelFor( const FunctorType & functor , const size_t work_count )
{
for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) {
functor( iwork );
}
}
};
template< class FunctorType , class WorkSpec >
class ParallelReduce< FunctorType , WorkSpec , Serial > {
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
ParallelReduce( const FunctorType & functor ,
const size_t work_count ,
pointer_type result = 0 )
{
if ( 0 == result ) {
result = (pointer_type ) Serial::resize_reduce_scratch( Reduce::value_size( functor ) );
}
functor.init( Reduce::reference( result ) );
for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) {
functor( iwork , Reduce::reference( result ) );
}
Reduce::final( functor , result );
}
void wait() {}
};
template< class FunctorType , class WorkSpec >
class ParallelScan< FunctorType , WorkSpec , Kokkos::Serial >
{
public:
typedef ReduceAdapter< FunctorType > Reduce ;
typedef typename Reduce::pointer_type pointer_type ;
inline
ParallelScan( const FunctorType & functor , const size_t work_count )
{
pointer_type result = (pointer_type ) Serial::resize_reduce_scratch( Reduce::value_size( functor ) );
functor.init( Reduce::reference( result ) );
for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) {
functor( iwork , Reduce::reference( result ) , true );
}
}
void wait() {}
};
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
#endif /* KOKKOS_SERIAL_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,218 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_HPP
#define KOKKOS_THREADS_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class ThreadsExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Device for a pool of Pthreads or C++11 threads on a CPU. */
class Threads {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
typedef Threads device_type ;
typedef Kokkos::HostSpace memory_space ;
typedef memory_space::size_type size_type ;
typedef Kokkos::LayoutRight array_layout ;
typedef Kokkos::Threads host_mirror_device_type ;
//@}
/*------------------------------------------------------------------------*/
//! \name Static functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
static int in_parallel();
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
 * not ready for work. This may consume fewer resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
/// \brief Free any resources being consumed by the device.
///
/// For the Threads device, this terminates spawned worker threads.
static void finalize();
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
//! \name Functions for the functor device interface
//@{
KOKKOS_INLINE_FUNCTION int league_rank() const ;
KOKKOS_INLINE_FUNCTION int league_size() const ;
KOKKOS_INLINE_FUNCTION int team_rank() const ;
KOKKOS_INLINE_FUNCTION int team_size() const ;
KOKKOS_INLINE_FUNCTION void team_barrier();
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value );
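  // Usage sketch (illustrative): when each thread reserves "count" slots in a
  // team-shared buffer, its starting position is the exclusive prefix sum:
  //   const int offset = dev.team_scan( count );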
/** \brief Intra-team exclusive prefix sum with team_rank() ordering,
 *         combined with a non-deterministically ordered inter-team accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename TypeLocal , typename TypeGlobal >
KOKKOS_INLINE_FUNCTION TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum );
KOKKOS_INLINE_FUNCTION void * get_shmem( const int size );
explicit inline Threads( Impl::ThreadsExec & );
/**@} */
/*------------------------------------------------------------------------*/
//! \name Device-specific functions
//@{
/** \brief Initialize the device in the "ready to work" state.
*
* The device is initialized in a "ready to work" or "awake" state.
* This state reduces latency and thus improves performance when
* dispatching work. However, the "awake" state consumes resources
* even when no work is being done. You may call sleep() to put
* the device in a "sleeping" state that does not consume as many
* resources, but it will take time (latency) to awaken the device
 * again (via the wake() method) so that it is ready for work.
*
* Teams of threads are distributed as evenly as possible across
* the requested number of numa regions and cores per numa region.
 * A team will not be split across numa regions.
*
 * If the 'use_' arguments are not supplied, hwloc is queried
* to use all available cores.
*/
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false );
static int is_initialized();
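// A typical host-side setup sketch (the thread count is illustrative):
//
//   Kokkos::Threads::initialize( 8 );   // spawn an 8-thread pool
//   // ... dispatch parallel_for / parallel_reduce with Threads functors ...
//   Kokkos::Threads::finalize();        // terminate the worker threads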
/** \brief Maximum size of a single thread team.
*
* If a parallel_{for,reduce,scan} operation requests a team_size that
* does not satisfy the condition: 0 == team_max() % team_size
* then some threads will idle.
*/
KOKKOS_INLINE_FUNCTION static unsigned team_max();
KOKKOS_INLINE_FUNCTION static unsigned league_max();
//@}
/*------------------------------------------------------------------------*/
private:
friend class Impl::ThreadsExec ;
Impl::ThreadsExec & m_exec ;
};
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
#include <Kokkos_Parallel.hpp>
#include <Threads/Kokkos_ThreadsExec.hpp>
#include <Threads/Kokkos_Threads_Parallel.hpp>
#endif /* KOKKOS_THREADS_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,86 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos: Manycore Performance-Portable Multidimensional Arrays
// Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Vectorization.hpp
/// \brief Declaration and definition of Kokkos::Vectorization interface.
#ifndef KOKKOS_VECTORIZATION_HPP
#define KOKKOS_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
namespace Kokkos {
template<class Device, int N>
struct Vectorization {
enum {increment = 1};
KOKKOS_FORCEINLINE_FUNCTION
static int begin() { return 0;}
KOKKOS_FORCEINLINE_FUNCTION
static int thread_rank(const Device &dev) {
return dev.team_rank();
}
KOKKOS_FORCEINLINE_FUNCTION
static int global_thread_rank(const Device &dev) {
return (dev.league_rank()*dev.team_size()+dev.team_rank());
}
KOKKOS_FORCEINLINE_FUNCTION
static bool is_lane_0(const Device &dev) {
return true;
}
template<class Scalar>
KOKKOS_FORCEINLINE_FUNCTION
static Scalar reduce(const Scalar& val) {
return val;
}
};
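// Usage sketch (illustrative): a kernel written against this interface runs
// unchanged whether a thread owns one lane (the default above) or several
// (a device-specific specialization such as the Cuda one included below):
//
//   typedef Kokkos::Vectorization< device_type , 8 > vec ;
//   for ( int i = my_begin + vec::begin() ; i < my_end ; i += vec::increment ) {
//     /* ... per-lane work ... */
//   }
//   if ( vec::is_lane_0( dev ) ) { /* once-per-thread work */ }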
}
#if defined( KOKKOS_HAVE_CUDA )
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
#endif
#endif

Some files were not shown because too many files have changed in this diff.