Improvements in lib/gpu

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3804 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
pscrozi 2010-02-11 20:46:34 +00:00
parent 8eaf196813
commit 71ed5c03ec
6 changed files with 180 additions and 363 deletions

View File

@ -27,29 +27,12 @@
#include "lj_gpu_memory.cu"
#include "lj_gpu_kernel.h"
#ifdef WINDLL
#include <windows.h>
BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
{
return TRUE;
}
#endif
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
using namespace std;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static float kernelTime = 0.0;
static int ncell1D;
static float *energy, *d_energy;
static float3 *d_force, *f_temp, *v_temp, *d_virial;
static cell_list cell_list_gpu;
// ---------------------------------------------------------------------------
// Convert something to a string
@ -93,7 +76,9 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
init_cell_list_const(cell_size, skin, boxlo, boxhi);
@ -153,6 +138,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
cudaError_t err;
ljm.atom.nall(nall);
ljm.atom.inum(inum);
@ -161,8 +148,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double evdwl=0.0;
static int buffer = CELL_SIZE;
static int ncell = (int)pow((float)ncell1D,3);
static int blockSize = BLOCK_1D;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
@ -176,7 +163,7 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
init_cell_list(cell_list_gpu, nall, ncell, buffer);
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
first_call = 0;
}
@ -198,13 +185,13 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
clear_cell_list(cell_list_gpu);
init_cell_list(cell_list_gpu, nall, ncell, buffer);
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
}
// build cell-list on GPU
ljm.atom.time_atom.start();
build_cell_list(host_x[0], host_type, cell_list_gpu,
ncell, ncell1D, buffer, inum, nall, ago);
ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
@ -216,25 +203,32 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaEventRecord(start, 0);
#endif
#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
(d_force, d_energy, d_virial, \
cell_list_gpu.pos, \
cell_list_gpu.idx, \
cell_list_gpu.type, \
cell_list_gpu.natom, \
inum, nall, ncell, ncellx, ncelly, ncellz);
// call the cell-list force kernel
const int BX=BLOCK_1D;
dim3 GX(ncell1D, ncell1D*ncell1D);
const int BX=blockSize;
dim3 GX(ncellx, ncelly*ncellz);
if (eflag == 0 && vflag == 0) {
kernel_lj_cell<false,false><<<GX, BX, 0>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
} else {
kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
if (blockSize == 64) KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
}
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("LJ force kernel launch error: %d\n", err);
exit(1);
}
#ifdef TIMING
@ -296,122 +290,6 @@ EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *
ago, eflag, vflag, boxlo, boxhi);
}
template <class numtyp, class acctyp>
double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
#ifdef NOUSE
static int first_call = 1;
if (first_call) {
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*) malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
cudaMalloc((void**)&d_pos, nall*sizeof(float3));
cudaMalloc((void**)&d_type, nall*sizeof(int));
first_call = 0;
}
ljm.atom.time_atom.start();
double *atom_pos = host_x[0];
for (int i = 0; i < 3*nall; i+=3) {
pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
}
cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
// N^2 force kernel
kernel_lj_n2<numtyp, acctyp><<<GX, BX>>>(d_force, d_energy, d_virial,
d_pos, d_type, eflag, vflag, inum, nall);
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
#endif
ljm.time_pair.stop();
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
return evdwl;
}
EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_n2<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
eflag, vflag, boxlo, boxhi);
}
EXTERN void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";

View File

@ -21,25 +21,29 @@
#define LJ_GPU_KERNEL
/* Cell list version of LJ kernel */
template<bool eflag, bool vflag>
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
float *energy, float3 *virial,
float3 *cell_list, unsigned int *cell_idx,
int *cell_type, int *cell_atom,
const int inum, const int nall, const int ncell)
const int inum, const int nall, const int ncell,
const int ncellx, const int ncelly, const int ncellz)
{
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
__shared__ int typeSh[CELL_SIZE];
__shared__ float posSh[CELL_SIZE*3];
__shared__ int typeSh[blockSize];
__shared__ float posSh[blockSize*3];
__shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -51,7 +55,7 @@ __global__ void kernel_lj_cell(float3 *force3,
__shared__ float *offsetSh;
// load force parameters into shared memory
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
cutsqSh[i] = _cutsq_<float>(itype,jtype);
@ -65,7 +69,7 @@ __global__ void kernel_lj_cell(float3 *force3,
lj3Sh = smem;
lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
@ -76,41 +80,41 @@ __global__ void kernel_lj_cell(float3 *force3,
__syncthreads();
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype;
float ix, iy, iz;
int i = tid + ii*BLOCK_1D;
unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
int i = tid + ii*blockSize;
unsigned int answer_pos = cell_idx[cid*blockSize+i];
// load current cell atom position and type into sMem
for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
int pid = cid*CELL_SIZE + j;
for (int j = tid; j < cell_atom[cid]; j += blockSize) {
int pid = cid*blockSize + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
if (answer_pos < inum) {
itype = typeSh[i];
ix = posSh[i ];
iy = posSh[i+ CELL_SIZE];
iz = posSh[i+2*CELL_SIZE];
iy = posSh[i+ blockSize];
iz = posSh[i+2*blockSize];
// compute force from current cell
for (int j = 0; j < cell_atom[cid]; j++) {
if (j == i) continue;
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
@ -149,16 +153,16 @@ __global__ void kernel_lj_cell(float3 *force3,
if (nborz == bz && nbory == by && nborx == bx) continue;
// compute cell id
int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
INT_MUL(nborz,gridDim.x*gridDim.x);
int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
INT_MUL(nborz,INT_MUL(ncellx,ncelly));
// load neighbor cell position and type into smem
for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
int pid = INT_MUL(cid_nbor,blockSize) + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
@ -166,8 +170,8 @@ __global__ void kernel_lj_cell(float3 *force3,
if (answer_pos < inum) {
for (int j = 0; j < cell_atom[cid_nbor]; j++) {
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
@ -438,116 +442,4 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
} // if ii
}
/* Brute force O(N^2) version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj_n2(float3 *force3,
float *energy, float3 *virial,
float3 *pos, int *type,
const bool eflag, const bool vflag, const int inum, const int nall)
{
int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
int tid = threadIdx.x;
__shared__ float posSh[BLOCK_1D*3];
__shared__ int typeSh[BLOCK_1D];
__shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=tid/MAX_SHARED_TYPES;
int jtype=tid%MAX_SHARED_TYPES;
cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
offsetSh[tid]=_offset_<numtyp>(itype,jtype);
}
__syncthreads();
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype, jtype;
int mtype;
numtyp ix, iy, iz;
if (gid < inum) {
ix = pos[gid].x;
iy = pos[gid].y;
iz = pos[gid].z;
itype = type[gid];
}
int pid = tid;
int nIter = ceil((float)nall/BLOCK_1D);
for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
if (pid < nall) {
posSh[tid ] = pos[pid].x;
posSh[tid+ BLOCK_1D] = pos[pid].y;
posSh[tid+2*BLOCK_1D] = pos[pid].z;
typeSh[tid] = type[pid];
}
__syncthreads();
if (gid < inum) {
int pid_j = jj*BLOCK_1D;
for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
if (jj == blockIdx.x && tid == j) continue;
if (pid_j < nall) {
numtyp delx = ix - posSh[j ];
numtyp dely = iy - posSh[j+ BLOCK_1D];
numtyp delz = iz - posSh[j+2*BLOCK_1D];
jtype = typeSh[j];
mtype = itype + jtype*MAX_SHARED_TYPES;
numtyp r2inv = delx * delx + dely * dely + delz * delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = (numtyp)1.0/r2inv;
numtyp r6inv = r2inv * r2inv * r2inv;
numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener +=(e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
}
__syncthreads();
}
if (gid < inum) {
if (eflag)
energy[gid] = ener;
if (vflag) {
virial[2*gid ] = v0;
virial[2*gid+1] = v1;
}
force3[gid] = f;
}
}
#endif

View File

@ -25,8 +25,8 @@
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64
#define CELL_SIZE 64
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000

View File

@ -17,6 +17,7 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"
@ -60,11 +61,14 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
float3 *pos,
int *type,
const int inum,
const int nall)
const int nall,
const int cell_size)
{
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
if (gid < nall) {
float3 p = pos[gid];
@ -75,11 +79,11 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
p.z = fmaxf(p.z, d_boxlo[2]-cSize);
p.z = fminf(p.z, d_boxhi[2]+cSize);
int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
+ (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx
+ (int)(p.z/cSize + 1.0) * ncellx * ncelly;
int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
int pid = cell_id*CELL_SIZE + atom_pos;
int pid = cell_id*cell_size + atom_pos;
cell_list[pid] = pos[gid];
cell_type[pid] = type[gid];
@ -92,18 +96,20 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
{
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
float skin = d_skin[0];
float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
@ -113,7 +119,7 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
float hiz = lowz + cSize + skin;
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
int pid = pbase + i;
float3 p = cell_list[pid];
p.x = fmaxf(p.x, d_boxlo[0]-cSize);
@ -136,25 +142,30 @@ __global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int nc
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
if (gid < ncell) {
if (cell_atom[gid] > CELL_SIZE)
if (cell_atom[gid] > blockDim.x)
*overflow = 1;
}
}
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
float cSize = d_cell_size[0];
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
int pid = pbase + i;
cell_list[pid] = pos[cell_idx[pid]];
}
@ -164,17 +175,7 @@ __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
{
int i = 0;
const unsigned int bitSize = sizeof(unsigned int)*8;
const unsigned int bitStep = 4;
const int gSize = size/BLOCK_1D;
while (bitSize > i*bitStep) {
radixSortBlocks<<<gSize, BLOCK_1D, 2*BLOCK_1D*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
i++;
}
}
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
@ -253,7 +254,7 @@ __device__ unsigned int rank(unsigned int preds)
unsigned int address = scan(preds);
__shared__ unsigned int numtrue;
if (threadIdx.x == BLOCK_1D - 1)
if (threadIdx.x == blockDim.x - 1)
{
numtrue = address + preds;
}
@ -266,11 +267,12 @@ __device__ unsigned int rank(unsigned int preds)
return rank;
}
template<int blockSize>
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
extern __shared__ unsigned int sMem1[];
__shared__ float sMem2[BLOCK_1D];
__shared__ int sMem3[BLOCK_1D];
__shared__ float sMem2[blockSize];
__shared__ int sMem3[blockSize];
int tid = threadIdx.x;
@ -314,7 +316,11 @@ __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, u
}
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
__global__ void radixSortBlocks(unsigned int *keys,
float3 *values1,
int *values2,
unsigned int nbits,
unsigned int startbit)
{
extern __shared__ unsigned int sMem[];
@ -328,13 +334,30 @@ __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values
value2 = values2[gid];
__syncthreads();
radixSortBlock(&key, &value1, &value2, nbits, startbit);
if (blockDim.x == 64)
radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
else if (blockDim.x == 128)
radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
else if (blockDim.x == 256)
radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);
keys[gid] = key;
values1[gid] = value1;
values2[gid] = value2;
}
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
{
int i = 0;
const unsigned int bitSize = sizeof(unsigned int)*8;
const unsigned int bitStep = 4;
const int gSize = size/cell_size;
while (bitSize > i*bitStep) {
radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
i++;
}
}
static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;
@ -376,9 +399,12 @@ void clear_cell_list(cell_list &cell_list_gpu)
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago)
const int ncell, const int ncellx, const int ncelly, const int ncellz,
const int buffer, const int inum, const int nall, const int ago)
{
cudaError_t err;
cudaMemset(d_overflow, 0, sizeof(int));
cudaMemset(d_rebuild, 0, sizeof(int));
@ -394,26 +420,22 @@ void build_cell_list(double *atom_pos, int *atom_type,
// copy the last built cell-list and test whether it needs to be rebuilt
if (!first_build) {
dim3 block(BLOCK_1D);
dim3 grid(ncell1D, ncell1D*ncell1D);
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
dim3 grid(ncellx, ncelly*ncellz);
kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.natom, d_pos);
cudaMemset(d_rebuild, 0, sizeof(int));
int *temp = (int*)malloc(sizeof(int)*ncell);
kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
cell_list_gpu.natom,
d_rebuild);
cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
err = cudaGetLastError();
assert(err == cudaSuccess);
}
/*if (!first_build) {
dim3 block(BLOCK_1D);
dim3 grid(ncell1D, ncell1D*ncell1D);
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.natom, d_pos);
}*/
if (ago == 0) rebuild = 1;
// build cell-list for the first time
@ -431,24 +453,32 @@ void build_cell_list(double *atom_pos, int *atom_type,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
d_pos, d_type, inum, nall);
d_pos, d_type, inum, nall, buffer);
err = cudaGetLastError();
assert(err == cudaSuccess);
// check cell list overflow
int overflow;
int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
int overflow = 0;
int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
d_overflow, ncell);
cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
if (overflow > 0) {
printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
printf("\n BLOCK_1D too small for cell list, please increase it!");
printf("\n BLOCK_1D = %d",BLOCK_1D);
printf("\n ncell = %d",ncell);
printf("\n gDimCell = %d",gDimCell);
printf("\n overflow = %d \n",overflow);
exit(0);
}
// sort atoms in every cell by atom index to avoid floating point associativity problem.
sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
cell_list_gpu.type, ncell*buffer);
cell_list_gpu.type, ncell*buffer, buffer);
cudaThreadSynchronize();
err = cudaGetLastError();
assert(err == cudaSuccess);
}
}

View File

@ -20,6 +20,22 @@
#ifndef PAIR_GPU_CELL_H
#define PAIR_GPU_CELL_H
#ifdef WINDLL
#include <windows.h>
#endif
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
using namespace std;
static float kernelTime = 0.0;
static int ncellx, ncelly, ncellz;
static float *energy, *d_energy;
static float3 *d_force, *f_temp, *v_temp, *d_virial;
typedef struct {
float3 *pos;
@ -28,6 +44,8 @@ typedef struct {
int *natom;
} cell_list;
static cell_list cell_list_gpu;
__global__ void kernel_set_cell_list(unsigned int *cell_idx);
__global__ void kernel_build_cell_list(float3 *cell_list,
unsigned int *cell_idx,
@ -54,8 +72,8 @@ void init_cell_list(cell_list &cell_list_gpu,
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago);
const int ncell, const int ncellx, const int ncelly, const int ncellz,
const int buffer, const int inum, const int nall, const int ago);
void clear_cell_list(cell_list &cell_list_gpu);

View File

@ -24,7 +24,6 @@
#include "nvc_timer.h"
#include "nvc_memory.h"
#define BLOCK_1D 64
#define IJ_SIZE 131072
class PairGPUNbor {