/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS

// split n work items into a two-dimensional grid of blocks plus threads per block;
// the returned int3 holds grid.x and grid.y in .x and .y, and the block size (threads.x) in .z
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 gridparams;
  int sharedsize = 16000;

  if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;

  if((n < 60 * 32) || (threadsmax < 64))
    gridparams.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))
    gridparams.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256))
    gridparams.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512))
    gridparams.z = 256;
  else gridparams.z = 512;

  if(p2) {
    gridparams.z = 16;

    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
  }

  int blocks = (n + gridparams.z - 1) / gridparams.z;

  if(blocks > 10000)
    gridparams.x = gridparams.y = int(sqrt(blocks));
  else {
    gridparams.x = blocks;
    gridparams.y = 1;
  }

  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;

  if(gridparams.x == 0) gridparams.x = 1;

  return gridparams;
}
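
// Usage sketch (illustrative, not part of the original package): the int3 returned by
// getgrid() is typically unpacked into CUDA launch dimensions, with .x/.y giving the block
// grid and .z the threads per block. The kernel and buffer names below are hypothetical.
//
//   int3 layout = getgrid(nall);                       // nall = number of work items
//   dim3 grid(layout.x, layout.y, 1);
//   dim3 threads(layout.z, 1, 1);
//   some_kernel<<<grid, threads>>>(dev_buffer, nall);  // hypothetical kernel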

// return value: 1 if f < 0; else 0
// take care when passing expressions like "blockIdx.x - n" for f: they might be evaluated as an unsigned int
static inline __device__ int negativCUDA(float f)
{
  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}

// return value: -1 if f < 0; else +1
static inline __device__ float fsignCUDA(float f)
{
  return f < 0.0f ? -1.0f : 1.0f;
}

// functions to copy data between global and shared memory (in fact they copy between any two
// device memory regions, provided you have the corresponding read and write access)
// blockDim.y and blockDim.z are assumed to be 1
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // signed bound, so the loop is skipped when n < blockDim.x

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}
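
// Usage sketch (illustrative, not part of the original package): staging a per-block work set
// from global into shared memory and writing results back. The kernel, arrays and size names
// are hypothetical.
//
//   __global__ void stage_and_store(float* glob_in, float* glob_out, int nper)
//   {
//     extern __shared__ float buf[];
//     copyGlobToShared(&glob_in[blockIdx.x * nper], buf, nper);   // all threads cooperate
//     // ... operate on buf[0..nper-1] ...
//     copySharedToGlob(buf, &glob_out[blockIdx.x * nper], nper);
//   }
//
//   // launched with nper * sizeof(float) bytes of dynamic shared memory per block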

// copy data between two memory areas on the device; 3D block dimensions are allowed
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  // cast keeps the loop bound signed, so the loop is skipped when n is smaller than the block
  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - (int)(blockDim.x * blockDim.y * blockDim.z); i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}
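
// Usage sketch (illustrative, not part of the original package): copyData() also works with a
// 3D thread block, because each thread derives a unique linear offset from (threadIdx.x,
// threadIdx.y, threadIdx.z). Names below are hypothetical.
//
//   // kernel launched with e.g. dim3 threads(8, 8, 4):
//   __global__ void device_copy(double* src, double* dst, int n)
//   {
//     copyData(src, dst, n);   // block-cooperative copy of n doubles, ends with __syncthreads()
//   }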

// functions to sum the values held by one block. The P2 variants require blockDim.x to be a
// power of 2; otherwise the behaviour is undefined.
// on return data[0] = sum_{i=0}^{blockDim.x-1} data[i]
// reduceBlockP2 and reduceBlock assume blockDim.y = 1 and blockDim.z = 1
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}
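
// Usage sketch (illustrative, not part of the original package): a kernel computing one
// partial sum per block with reduceBlock() on dynamic shared memory. The kernel and buffer
// names are hypothetical.
//
//   __global__ void block_sums(float* input, float* partial, int n)
//   {
//     extern __shared__ float buf[];                    // one float per thread
//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
//     buf[threadIdx.x] = idx < n ? input[idx] : 0.0f;   // zero-pad the tail
//     reduceBlock(buf);                                 // buf[0] now holds the block sum
//     if(threadIdx.x == 0) partial[blockIdx.x] = buf[0];
//   }
//
//   // launched e.g. as: block_sums<<<nblocks, nthreads, nthreads * sizeof(float)>>>(in, out, n);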

// fill the first n elements of data with value (all threads of the block cooperate)
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  // cast keeps the loop bound signed, so the loop is skipped when n < blockDim.x
  for(i = 0; i < n - (int)blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - (int)blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}

static __device__ inline void reduce(float* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // fold the tail [p2, n) onto [0, n - p2)
  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;  // restart the per-thread chunk counter for every reduction step

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void reduce(double* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // fold the tail [p2, n) onto [0, n - p2)
  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;  // restart the per-thread chunk counter for every reduction step

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

static __device__ inline void minOfData(double* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;  // restart the per-thread chunk counter for every reduction step

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(double* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void minOfData(float* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

static __device__ inline void maxOfData(float* data, int n) // caution: not verified in the original code
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    j = 0;

    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _x_type_tex.normalized = false;                     // do not use normalized texture coordinates
  _x_type_tex.filterMode = cudaFilterModePoint;       // point sampling, so no filtering
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);

#if X_PRECISION == 1
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(_x_type_tex, i);
#else
  return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
  return _x_type[i];
#endif
}
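
// Note (editorial, illustrative): texture fetches cannot return doubles directly, which is why
// the double-precision paths above bind the arrays as int2/int4 textures and reassemble each
// double from its two 32-bit halves with __hiloint2double. A minimal sketch of the idea, for a
// hypothetical texture<int2, 1> "tex" bound over an array of doubles:
//
//   int2 halves = tex1Dfetch(tex, i);                       // raw 64 bits of element i
//   double value = __hiloint2double(halves.y, halves.x);    // (high word, low word) -> double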

#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  V_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _v_radius_tex.normalized = false;                     // do not use normalized texture coordinates
  _v_radius_tex.filterMode = cudaFilterModePoint;       // point sampling, so no filtering
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_v_radius_tex, i);
#else
  return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
  return _v_radius[i];
#endif
}

inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _omega_rmass_tex.normalized = false;                     // do not use normalized texture coordinates
  _omega_rmass_tex.filterMode = cudaFilterModePoint;       // point sampling, so no filtering
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);

#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}

static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
  return tex1Dfetch(_omega_rmass_tex, i);
#else
  return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
  return _omega_rmass[i];
#endif
}

#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  F_FLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif

inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  _q_tex.normalized = false;                     // do not use normalized texture coordinates
  _q_tex.filterMode = cudaFilterModePoint;       // point sampling, so no filtering
  _q_tex.addressMode[0] = cudaAddressModeWrap;   // wrap texture coordinates
  const textureReference* q_texture_ptr = &MY_AP(q_tex);

#if F_PRECISION == 1
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}

static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_q_tex, i);
#else
  return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
  return _q[i];
#endif
}

#endif // CRM_CUDA_UTILS

/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
  #ifdef CUDA_USE_TEXTURE
    _coeff_tex.normalized = false;                      // access with normalized texture coordinates
    _coeff_tex.filterMode = cudaFilterModePoint;        // Point mode, so no
    _coeff_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
    const textureReference* coeff_texture_ptr;
    cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));

    #if F_PRECISION == 1
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
      cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
    #else
      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
      cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
    #endif
  #endif
}

static __device__ inline X_FLOAT4 fetchXType(int i)
{
  #ifdef CUDA_USE_TEXTURE
    #if X_PRECISION == 1
      return tex1Dfetch(_x_type_tex,i);
    #else
      return tex1Dfetch_double(_x_type_tex,i);
    #endif
  #else
    return _x_type[i];
  #endif
}
*/

#define SBBITS 30

// extract the special-bond flag stored in the two bits above SBBITS of an encoded neighbor index
static inline __device__ int sbmask(int j)
{
  return j >> SBBITS & 3;
}
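
// Usage sketch (illustrative, not part of the original package): neighbor lists encode the
// special-bond status of a pair in the two bits above SBBITS; the remaining low bits are the
// actual neighbor index. A typical decode, with a hypothetical encoded entry:
//
//   int j = neighbor_list_entry;       // hypothetical encoded neighbor entry
//   int which = sbmask(j);             // 0 = normal pair, 1/2/3 = 1-2/1-3/1-4 special bond
//   j &= (1 << SBBITS) - 1;            // strip the flag bits to recover the index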

static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
  if(_triclinic == 0) {
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

  } else {
    if(_periodicity[2]) {
      // the tilt corrections test the unwrapped delta.z, so apply them before wrapping delta.z itself
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}

static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
  ci.x = x2.x - x1.x;
  ci.y = x2.y - x1.y;
  ci.z = x2.z - x1.z;
  minimum_image(ci);
  ci.x += x1.x;
  ci.y += x1.y;
  ci.z += x1.z;
}
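
// Usage sketch (illustrative, not part of the original package): computing a pair separation
// under periodic boundaries with minimum_image(); the index names are hypothetical.
//
//   X_FLOAT4 xi = fetchXType(i);
//   X_FLOAT4 xj = fetchXType(j);
//   X_FLOAT4 delta;
//   delta.x = xi.x - xj.x;
//   delta.y = xi.y - xj.y;
//   delta.z = xi.z - xj.z;
//   minimum_image(delta);              // wrap the separation into the primary periodic cell
//   X_FLOAT rsq = delta.x * delta.x + delta.y * delta.y + delta.z * delta.z;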