lammps/lib/gpu/pair_gpu_cell.cu

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"
static __constant__ float d_boxlo[3];
static __constant__ float d_boxhi[3];
static __constant__ float d_cell_size[1];
static __constant__ float d_skin[1];
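
// Copy the cell geometry (cell size, neighbor skin, box bounds) into device
// constant memory so every kernel in this file can read it without extra
// kernel arguments.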
void init_cell_list_const(double cell_size, double skin,
                          double *boxlo, double *boxhi)
{
  float cell_size1 = cell_size;
  float skin1 = skin;
  float boxlo1[3], boxhi1[3];
  for (int i = 0; i < 3; i++) {
    boxlo1[i] = boxlo[i];
    boxhi1[i] = boxhi[i];
  }

  cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
}
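
// Mark every slot of the cell-index array as empty (BIG_NUMBER) so that
// unused slots sort to the end of each cell during the per-cell sort.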
__global__ void kernel_set_cell_list(unsigned int *cell_idx)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  cell_idx[gid] = BIG_NUMBER;
}
// Build the cell list: bin every atom (owned + ghost) into a cell of edge
// length d_cell_size, with one layer of ghost cells padding each box side.
// Atoms outside the padded box are clamped onto its surface first.
__global__ void kernel_build_cell_list(float3 *cell_list,
                                       unsigned int *cell_idx,
                                       int *cell_type,
                                       int *cell_atom,
                                       float3 *pos,
                                       int *type,
                                       const int inum,
                                       const int nall,
                                       const int cell_size)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  if (gid < nall) {
    float3 p = pos[gid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    // cell index measured from the lower box bound, offset by one ghost cell,
    // consistent with the cell bounds used in kernel_test_rebuild
    int cell_id = (int)((p.x - d_boxlo[0])/cSize + 1.0f)
                + (int)((p.y - d_boxlo[1])/cSize + 1.0f) * ncellx
                + (int)((p.z - d_boxlo[2])/cSize + 1.0f) * ncellx * ncelly;

    // reserve a slot in the cell and store position, type, and atom index
    int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
    int pid = cell_id*cell_size + atom_pos;

    cell_list[pid] = pos[gid];
    cell_type[pid] = type[gid];
    cell_idx [pid] = gid;
  }
}
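
// Check whether the current cell binning is still valid: one block per cell
// scans its atoms and sets *rebuild if any atom has drifted more than half
// the neighbor skin outside its cell bounds.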
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2d block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;
  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list

  float skin = d_skin[0];
  float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
  float hix = lowx + cSize + skin;
  float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin;
  float hiy = lowy + cSize + skin;
  float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
  float hiz = lowz + cSize + skin;

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    float3 p = cell_list[pid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) {
      *rebuild = 1;
    }
  }
}
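
// Flag an overflow if any cell holds more atoms than the per-cell capacity
// (one thread per cell; blockDim.x equals the cell buffer size at launch).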
__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  if (gid < ncell) {
    if (cell_atom[gid] > blockDim.x)
      *overflow = 1;
  }
}
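
// Refresh the stored positions of an existing cell list from the current
// coordinates, reusing the saved atom indices instead of rebinning.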
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2d block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;
  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    cell_list[pid] = pos[cell_idx[pid]];
  }
}
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif
#define WARP_SIZE 32
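
// The scan/rank/radix-sort helpers below appear to be adapted from the radix
// sort sample in the NVIDIA CUDA SDK. scanwarp computes an exclusive prefix
// sum of `val` across one warp using shared memory.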
template<class T, int maxlevel>
__device__ T scanwarp(T val, T* sData)
{
  // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp =
  // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
  int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
  sData[idx] = 0;
  idx += WARP_SIZE;
  sData[idx] = val; __SYNC

#ifdef __DEVICE_EMULATION__
  T t = sData[idx - 1]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 2]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 4]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 8]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 16]; __SYNC
  sData[idx] += t; __SYNC
#else
  if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC
  if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC
  if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC
  if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC
  if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC
#endif

  return sData[idx] - val; // convert inclusive -> exclusive
}
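
// Block-wide exclusive prefix sum: scan within each warp, then scan the
// per-warp totals and add each warp's offset back to its threads' results.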
__device__ unsigned int scan(unsigned int idata)
{
  extern __shared__ unsigned int ptr[];

  unsigned int idx = threadIdx.x;
  unsigned int val = idata;

  val = scanwarp<unsigned int, 4>(val, ptr);
  __syncthreads();

  if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
  {
    ptr[idx >> 5] = val + idata;
  }
  __syncthreads();

#ifndef __DEVICE_EMULATION__
  if (idx < WARP_SIZE)
#endif
  {
    ptr[idx] = scanwarp<unsigned int, 2>(ptr[idx], ptr);
  }
  __syncthreads();

  val += ptr[idx >> 5];

  return val;
}
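
// Compute the output position (rank) of each element for one split pass:
// elements whose predicate is true are packed first, the rest follow,
// with the original order preserved within each group.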
__device__ unsigned int rank(unsigned int preds)
{
  unsigned int address = scan(preds);

  __shared__ unsigned int numtrue;
  if (threadIdx.x == blockDim.x - 1)
  {
    numtrue = address + preds;
  }
  __syncthreads();

  unsigned int rank;
  unsigned int idx = threadIdx.x;
  rank = (preds) ? address : numtrue + idx - address;

  return rank;
}
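
// Sort the keys held by one thread block (one key per thread) over `nbits`
// bits starting at `startbit`, carrying a float3 payload (position) and an
// int payload (type) through each one-bit split.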
template<int blockSize>
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
  extern __shared__ unsigned int sMem1[];
  __shared__ float sMem2[blockSize];
  __shared__ int sMem3[blockSize];

  int tid = threadIdx.x;

  for(unsigned int shift = startbit; shift < (startbit + nbits); ++shift) {
    unsigned int lsb;
    lsb = !(((*key) >> shift) & 0x1);

    unsigned int r;
    r = rank(lsb);

    // scatter the key to its sorted position within the block, then read it
    // back in thread order; repeat for each payload component
    sMem1[r] = *key;
    __syncthreads();
    *key = sMem1[tid];
    __syncthreads();

    sMem2[r] = (*value1).x;
    __syncthreads();
    (*value1).x = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).y;
    __syncthreads();
    (*value1).y = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).z;
    __syncthreads();
    (*value1).z = sMem2[tid];
    __syncthreads();

    sMem3[r] = *value2;
    __syncthreads();
    *value2 = sMem3[tid];
    __syncthreads();
  }
}
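
// Kernel driver for the per-block sort: each block loads one contiguous chunk
// of keys/payloads (one cell of the cell list), sorts it in shared memory, and
// writes it back. Only block sizes of 64, 128, and 256 threads are supported.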
__global__ void radixSortBlocks(unsigned int *keys,
                                float3 *values1,
                                int *values2,
                                unsigned int nbits,
                                unsigned int startbit)
{
  extern __shared__ unsigned int sMem[];

  int gid = threadIdx.x + blockIdx.x * blockDim.x;

  unsigned int key;
  float3 value1;
  int value2;

  key = keys[gid];
  value1 = values1[gid];
  value2 = values2[gid];
  __syncthreads();

  if (blockDim.x == 64)
    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 128)
    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 256)
    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);

  keys[gid] = key;
  values1[gid] = value1;
  values2[gid] = value2;
}
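
// Host driver: sort each cell's slots independently by looping over the full
// 32-bit keys four bits at a time. One block of `cell_size` threads handles
// one cell, with 2*cell_size unsigned ints of dynamic shared memory for the scan.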
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
{
  int i = 0;
  const unsigned int bitSize = sizeof(unsigned int)*8;
  const unsigned int bitStep = 4;
  const int gSize = size/cell_size;

  while (bitSize > i*bitStep) {
    radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
    i++;
  }
}
static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;
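
// Allocate the cell-list arrays (ncell cells with `buffer` slots each) and
// the scratch buffers sized for all owned + ghost atoms (nall).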
void init_cell_list(cell_list &cell_list_gpu,
                    const int nall,
                    const int ncell,
                    const int buffer)
{
  cudaMalloc((void**)&(cell_list_gpu.pos), ncell*buffer*sizeof(float3));
  cudaMalloc((void**)&(cell_list_gpu.idx), ncell*buffer*sizeof(unsigned int));
  cudaMalloc((void**)&(cell_list_gpu.type), ncell*buffer*sizeof(int));
  cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int));

  cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
  cudaMalloc((void**)&d_pos, nall*sizeof(float3));
  cudaMalloc((void**)&d_type, nall*sizeof(int));
  cudaMalloc((void**)&d_overflow, sizeof(int));
  cudaMalloc((void**)&d_rebuild, sizeof(int));

  cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
  cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3));
}

void clear_cell_list(cell_list &cell_list_gpu)
{
  cudaFree(cell_list_gpu.pos);
  cudaFree(cell_list_gpu.idx);
  cudaFree(cell_list_gpu.natom);
  cudaFree(cell_list_gpu.type);

  cudaFreeHost(pos_temp);
  cudaFree(d_pos);
  cudaFree(d_type);
  cudaFree(d_overflow);
  cudaFree(d_rebuild);
}
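
// Build (or refresh) the GPU cell list for this timestep:
//   1. copy positions and types to the device,
//   2. if a list already exists, refresh its stored positions and test whether
//      any atom has left its skin-padded cell,
//   3. if a rebuild is needed (or forced via ago == 0), rebin all atoms, abort
//      on cell overflow, and sort each cell by atom index so that force
//      summation stays deterministic despite floating-point non-associativity.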
void build_cell_list(double *atom_pos, int *atom_type,
                     cell_list &cell_list_gpu,
                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
                     const int buffer, const int inum, const int nall, const int ago)
{
  cudaError_t err;

  cudaMemset(d_overflow, 0, sizeof(int));
  cudaMemset(d_rebuild, 0, sizeof(int));

  // copy position and type to GPU
  for (int i = 0; i < 3*nall; i+=3) {
    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
  }
  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice);

  static int first_build = 1;
  int rebuild = 0;

  // copy the last built cell-list and test whether it needs to be rebuilt
  if (!first_build) {
    dim3 grid(ncellx, ncelly*ncellz);
    kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
                                       cell_list_gpu.idx,
                                       cell_list_gpu.natom, d_pos);
    cudaMemset(d_rebuild, 0, sizeof(int));
    kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
                                          cell_list_gpu.natom,
                                          d_rebuild);
    cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }

  if (ago == 0) rebuild = 1;

  // build cell-list for the first time
  if (first_build || rebuild) {
    first_build = 0;
    // cout << "Building cell list..." << endl;
    cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));

    // initialize d_cell_idx for the sorting routine
    kernel_set_cell_list<<<ncell, buffer>>>(cell_list_gpu.idx);

    // build cell list
    dim3 blockDim(128);
    dim3 gridDim(static_cast<int>(ceil(static_cast<double>(nall)/blockDim.x)));
    kernel_build_cell_list<<<gridDim, blockDim>>>(cell_list_gpu.pos,
                                                  cell_list_gpu.idx,
                                                  cell_list_gpu.type,
                                                  cell_list_gpu.natom,
                                                  d_pos, d_type, inum, nall, buffer);
    err = cudaGetLastError();
    assert(err == cudaSuccess);

    // check cell list overflow
    int overflow = 0;
    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
                                               d_overflow, ncell);
    cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);

    if (overflow > 0) {
      printf("\n BLOCK_1D too small for cell list, please increase it!");
      printf("\n BLOCK_1D = %d",BLOCK_1D);
      printf("\n ncell = %d",ncell);
      printf("\n gDimCell = %d",gDimCell);
      printf("\n overflow = %d \n",overflow);
      exit(0);
    }

    // sort atoms in every cell by atom index to avoid floating-point associativity problems
    sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
               cell_list_gpu.type, ncell*buffer, buffer);
    cudaThreadSynchronize();
    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }
}
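
/* Usage sketch (illustrative only; not part of the original file). A host
   driver such as lj_gpu.cu would call these routines roughly as follows; the
   variables below (`cells`, `boxlo`, `boxhi`, `cell_size`, `skin`, `ncell`,
   `ncellx/y/z`, `buffer`, `x`, `type`, `inum`, `nall`, `ago`, `nsteps`) are
   assumed host-side quantities, not definitions from this file.

     cell_list cells;
     init_cell_list_const(cell_size, skin, boxlo, boxhi);  // box geometry to constant memory
     init_cell_list(cells, nall, ncell, buffer);           // allocate GPU arrays

     for (int step = 0; step < nsteps; ++step) {
       // rebins only when needed (or when ago == 0 forces a rebuild)
       build_cell_list(x, type, cells, ncell, ncellx, ncelly, ncellz,
                       buffer, inum, nall, ago);
       // ... force kernels then read cells.pos, cells.type, cells.idx, cells.natom ...
     }

     clear_cell_list(cells);                               // release GPU arrays
*/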