/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
                         Peng Wang (Nvidia), penwang@nvidia.com
                         Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__

#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"

static __constant__ float d_boxlo[3];
static __constant__ float d_boxhi[3];
static __constant__ float d_cell_size[1];
static __constant__ float d_skin[1];

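// The box bounds, cell size, and neighbor skin live in constant memory so
// every kernel below can read them without extra arguments.  They are
// written once per setup by init_cell_list_const().
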
void init_cell_list_const(double cell_size, double skin,
                          double *boxlo, double *boxhi)
{
  float cell_size1 = cell_size;
  float skin1 = skin;
  float boxlo1[3], boxhi1[3];
  for (int i = 0; i < 3; i++) {
    boxlo1[i] = boxlo[i];
    boxhi1[i] = boxhi[i];
  }

  cudaMemcpyToSymbol(d_cell_size, &cell_size1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxlo, boxlo1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_boxhi, boxhi1, 3*sizeof(float),
                     0, cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_skin, &skin1, sizeof(float),
                     0, cudaMemcpyHostToDevice);
}

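// Reset every slot of the cell-index array to BIG_NUMBER.  Unused slots then
// carry the largest possible key, so the per-cell radix sort further below
// pushes them to the end of each cell.
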
__global__ void kernel_set_cell_list(unsigned int *cell_idx)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  cell_idx[gid] = BIG_NUMBER;
}

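// Layout of the cell list: the domain is covered by ncellx*ncelly*ncellz
// cells, including one layer of ghost cells on each side of the box, and
// every cell owns a fixed block of cell_size (= buffer) slots.  Slots inside
// a cell are handed out with an atomicAdd on the per-cell atom counter.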
// build the cell list
__global__ void kernel_build_cell_list(float3 *cell_list,
                                       unsigned int *cell_idx,
                                       int *cell_type,
                                       int *cell_atom,
                                       float3 *pos,
                                       int *type,
                                       const int inum,
                                       const int nall,
                                       const int cell_size)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  if (gid < nall) {
    float3 p = pos[gid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    // cell coordinates are measured from the lower box bound; the +1 offset
    // accounts for the ghost-cell layer
    int cell_id = (int)((p.x - d_boxlo[0])/cSize + 1.0)
                + (int)((p.y - d_boxlo[1])/cSize + 1.0) * ncellx
                + (int)((p.z - d_boxlo[2])/cSize + 1.0) * ncellx * ncelly;

    int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
    int pid = cell_id*cell_size + atom_pos;

    cell_list[pid] = pos[gid];
    cell_type[pid] = type[gid];
    cell_idx [pid] = gid;
  }
}

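// Check whether the current cell list is still usable: one thread block per
// cell; if any atom has drifted more than half the skin distance outside its
// cell's bounds, flag that the list has to be rebuilt.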
__global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebuild)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2D block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
  int pbase = INT_MUL(cid, blockDim.x); // atom position id in cell list

  float skin = d_skin[0];
  float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
  float hix  = lowx + cSize + skin;
  float lowy = d_boxlo[1] + (by-1)*cSize - 0.5*skin;
  float hiy  = lowy + cSize + skin;
  float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
  float hiz  = lowz + cSize + skin;

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    float3 p = cell_list[pid];
    p.x = fmaxf(p.x, d_boxlo[0]-cSize);
    p.x = fminf(p.x, d_boxhi[0]+cSize);
    p.y = fmaxf(p.y, d_boxlo[1]-cSize);
    p.y = fminf(p.y, d_boxhi[1]+cSize);
    p.z = fmaxf(p.z, d_boxlo[2]-cSize);
    p.z = fminf(p.z, d_boxhi[2]+cSize);

    if (p.x < lowx || p.x > hix || p.y < lowy || p.y > hiy || p.z < lowz || p.z > hiz) {
      *rebuild = 1;
    }
  }
}

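// One thread per cell: flag an overflow if any cell has collected more atoms
// than fit into its fixed block of blockDim.x (= buffer) slots.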
__global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int ncell)
{
  unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;

  if (gid < ncell) {
    if (cell_atom[gid] > blockDim.x)
      *overflow = 1;
  }
}

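// Refresh the coordinates stored in the existing cell list from the new atom
// positions, without changing which cell each atom belongs to.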
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
  float cSize = d_cell_size[0];
  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);

  // calculate 3D block idx from 2D block
  int bx = blockIdx.x;
  int by = blockIdx.y % ncelly;
  int bz = blockIdx.y / ncelly;

  int tid = threadIdx.x;

  // compute cell idx from 3D block idx
  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
  int pbase = INT_MUL(cid, blockDim.x); // atom position id in cell list

  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
    int pid = pbase + i;
    cell_list[pid] = pos[cell_idx[pid]];
  }
}

__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);

#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
#else
#define __SYNC
#endif

#define WARP_SIZE 32

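// Block-wide split primitives used by the per-cell radix sort below
// (patterned after the scan/rank code in the CUDA SDK radix sort example):
// scanwarp() computes a prefix sum within a warp and returns it as an
// exclusive sum, scan() combines the warp totals into a block-wide exclusive
// prefix sum, and rank() converts each thread's 0/1 predicate into the
// destination slot of its key for the current bit.
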
template<class T, int maxlevel>
__device__ T scanwarp(T val, T* sData)
{
  // The following is the same as 2 * RadixSort::WARP_SIZE * warpId + threadInWarp =
  // 64*(threadIdx.x >> 5) + (threadIdx.x & (RadixSort::WARP_SIZE - 1))
  int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
  sData[idx] = 0;
  idx += WARP_SIZE;
  sData[idx] = val; __SYNC

#ifdef __DEVICE_EMULATION__
  T t = sData[idx - 1]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 2]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 4]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 8]; __SYNC
  sData[idx] += t; __SYNC
  t = sData[idx - 16]; __SYNC
  sData[idx] += t; __SYNC
#else
  if (0 <= maxlevel) { sData[idx] += sData[idx - 1]; } __SYNC
  if (1 <= maxlevel) { sData[idx] += sData[idx - 2]; } __SYNC
  if (2 <= maxlevel) { sData[idx] += sData[idx - 4]; } __SYNC
  if (3 <= maxlevel) { sData[idx] += sData[idx - 8]; } __SYNC
  if (4 <= maxlevel) { sData[idx] += sData[idx -16]; } __SYNC
#endif

  return sData[idx] - val;  // convert inclusive -> exclusive
}

__device__ unsigned int scan(unsigned int idata)
{
  extern __shared__ unsigned int ptr[];

  unsigned int idx = threadIdx.x;

  unsigned int val = idata;

  val = scanwarp<unsigned int, 4>(val, ptr);
  __syncthreads();

  if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
  {
    ptr[idx >> 5] = val + idata;
  }
  __syncthreads();

#ifndef __DEVICE_EMULATION__
  if (idx < WARP_SIZE)
#endif
  {
    ptr[idx] = scanwarp<unsigned int, 2>(ptr[idx], ptr);
  }
  __syncthreads();

  val += ptr[idx >> 5];

  return val;
}

__device__ unsigned int rank(unsigned int preds)
{
  unsigned int address = scan(preds);

  __shared__ unsigned int numtrue;
  if (threadIdx.x == blockDim.x - 1)
  {
    numtrue = address + preds;
  }
  __syncthreads();

  unsigned int rank;
  unsigned int idx = threadIdx.x;
  rank = (preds) ? address : numtrue + idx - address;

  return rank;
}

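// Sort the keys held by one thread block (one key per thread), nbits bits at
// a time starting at startbit, and move the float3 position and int type
// payloads along with their keys.  Keys with a 0 in the current bit are
// ranked first, so the sort is stable from the lowest bit upward.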
template<int blockSize>
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
  extern __shared__ unsigned int sMem1[];
  __shared__ float sMem2[blockSize];
  __shared__ int sMem3[blockSize];

  int tid = threadIdx.x;

  for (unsigned int shift = startbit; shift < (startbit + nbits); ++shift) {
    unsigned int lsb;
    lsb = !(((*key) >> shift) & 0x1);

    unsigned int r;

    r = rank(lsb);

    // scatter the key to its new position for this bit ...
    sMem1[r] = *key;
    __syncthreads();

    // ... and read it back in thread order
    *key = sMem1[tid];
    __syncthreads();

    sMem2[r] = (*value1).x;
    __syncthreads();
    (*value1).x = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).y;
    __syncthreads();
    (*value1).y = sMem2[tid];
    __syncthreads();

    sMem2[r] = (*value1).z;
    __syncthreads();
    (*value1).z = sMem2[tid];
    __syncthreads();

    sMem3[r] = *value2;
    __syncthreads();
    *value2 = sMem3[tid];
    __syncthreads();
  }
}

__global__ void radixSortBlocks(unsigned int *keys,
                                float3 *values1,
                                int *values2,
                                unsigned int nbits,
                                unsigned int startbit)
{
  extern __shared__ unsigned int sMem[];

  int gid = threadIdx.x + blockIdx.x * blockDim.x;
  unsigned int key;
  float3 value1;
  int value2;
  key = keys[gid];
  value1 = values1[gid];
  value2 = values2[gid];
  __syncthreads();

  if (blockDim.x == 64)
    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 128)
    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
  else if (blockDim.x == 256)
    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);

  keys[gid] = key;
  values1[gid] = value1;
  values2[gid] = value2;
}

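// Host-side driver for the per-cell sort: launch one block per cell (block
// size = cell_size = buffer) and sweep over the 32-bit keys in 4-bit passes.
// The dynamic shared memory of 2*cell_size unsigned ints is what scanwarp()
// needs for its padded warp-scan layout.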
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
{
  int i = 0;
  const unsigned int bitSize = sizeof(unsigned int)*8;
  const unsigned int bitStep = 4;
  const int gSize = size/cell_size;
  while (bitSize > i*bitStep) {
    radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
    i++;
  }
}

static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;

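// Per-run workspace: a page-locked host staging buffer for positions, the
// device-side position/type arrays, and single-int flags for the overflow
// and rebuild tests.  init_cell_list() allocates ncell*buffer slots for the
// cell list itself, i.e. a fixed buffer of `buffer` atoms per cell.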
void init_cell_list(cell_list &cell_list_gpu,
                    const int nall,
                    const int ncell,
                    const int buffer)
{
  cudaMalloc((void**)&(cell_list_gpu.pos), ncell*buffer*sizeof(float3));
  cudaMalloc((void**)&(cell_list_gpu.idx), ncell*buffer*sizeof(unsigned int));
  cudaMalloc((void**)&(cell_list_gpu.type), ncell*buffer*sizeof(int));
  cudaMalloc((void**)&(cell_list_gpu.natom), ncell*sizeof(int));

  cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
  cudaMalloc((void**)&d_pos, nall*sizeof(float3));
  cudaMalloc((void**)&d_type, nall*sizeof(int));
  cudaMalloc((void**)&d_overflow, sizeof(int));
  cudaMalloc((void**)&d_rebuild, sizeof(int));

  cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
  cudaMemset(cell_list_gpu.pos, 0, ncell*buffer*sizeof(float3));
}

void clear_cell_list(cell_list &cell_list_gpu)
{
  cudaFree(cell_list_gpu.pos);
  cudaFree(cell_list_gpu.idx);
  cudaFree(cell_list_gpu.natom);
  cudaFree(cell_list_gpu.type);

  cudaFreeHost(pos_temp);
  cudaFree(d_pos);
  cudaFree(d_type);
  cudaFree(d_overflow);
  cudaFree(d_rebuild);
}

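// Typical call sequence from the host driver (a sketch only -- the exact
// caller and the code that sizes ncell and buffer live elsewhere in the GPU
// library, e.g. the lj_gpu host code):
//
//   init_cell_list_const(cell_size, skin, boxlo, boxhi); // once per setup
//   init_cell_list(cell_list_gpu, nall, ncell, buffer);  // once per setup
//   ...
//   build_cell_list(x, type, cell_list_gpu, ncell,       // every step on
//                   ncellx, ncelly, ncellz, buffer,      // which the list
//                   inum, nall, ago);                    // may change
//   ...
//   clear_cell_list(cell_list_gpu);                      // at teardown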
void build_cell_list(double *atom_pos, int *atom_type,
                     cell_list &cell_list_gpu,
                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
                     const int buffer, const int inum, const int nall, const int ago)
{
  cudaError_t err;

  cudaMemset(d_overflow, 0, sizeof(int));
  cudaMemset(d_rebuild, 0, sizeof(int));

  // copy position and type to GPU
  for (int i = 0; i < 3*nall; i += 3) {
    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
  }
  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
  cudaMemcpy(d_type, atom_type, nall*sizeof(int), cudaMemcpyHostToDevice);

  static int first_build = 1;
  int rebuild = 0;

  // copy the last built cell-list and test whether it needs to be rebuilt
  if (!first_build) {
    dim3 grid(ncellx, ncelly*ncellz);
    kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
                                       cell_list_gpu.idx,
                                       cell_list_gpu.natom, d_pos);
    cudaMemset(d_rebuild, 0, sizeof(int));
    kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
                                          cell_list_gpu.natom,
                                          d_rebuild);
    cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);

    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }

  if (ago == 0) rebuild = 1;

  // build the cell list from scratch (first call, or whenever a rebuild is needed)
  if (first_build || rebuild) {
    first_build = 0;
    // cout << "Building cell list..." << endl;
    cudaMemset(cell_list_gpu.natom, 0, ncell*sizeof(int));
    // initialize d_cell_idx for the sorting routine
    kernel_set_cell_list<<<ncell, buffer>>>(cell_list_gpu.idx);

    // build cell list
    dim3 blockDim(128);
    dim3 gridDim(static_cast<int>(ceil(static_cast<double>(nall)/blockDim.x)));
    kernel_build_cell_list<<<gridDim, blockDim>>>(cell_list_gpu.pos,
                                                  cell_list_gpu.idx,
                                                  cell_list_gpu.type,
                                                  cell_list_gpu.natom,
                                                  d_pos, d_type, inum, nall, buffer);
    err = cudaGetLastError();
    assert(err == cudaSuccess);

    // check cell list overflow
    int overflow = 0;
    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
                                               d_overflow, ncell);
    cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);

    if (overflow > 0) {
      printf("\n BLOCK_1D too small for cell list, please increase it!");
      printf("\n BLOCK_1D = %d", BLOCK_1D);
      printf("\n ncell = %d", ncell);
      printf("\n gDimCell = %d", gDimCell);
      printf("\n overflow = %d \n", overflow);
      exit(0);
    }

    // sort the atoms in every cell by atom index to avoid floating-point associativity problems
    sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
               cell_list_gpu.type, ncell*buffer, buffer);

    cudaThreadSynchronize();
    err = cudaGetLastError();
    assert(err == cudaSuccess);
  }
}