diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index e8791069d5..e31321ad2e 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -27,29 +27,12 @@
 #include "lj_gpu_memory.cu"
 #include "lj_gpu_kernel.h"
 
-#ifdef WINDLL
-#include <windows.h>
-BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
-{
-  return TRUE;
-}
-#endif
-
-#ifdef WINDLL
-#define EXTERN extern "C" __declspec(dllexport)
-#else
-#define EXTERN
-#endif
-
-using namespace std;
 
 static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
 #define LJMT LJ_GPU_Memory<numtyp,acctyp>
-static float kernelTime = 0.0;
-static int ncell1D;
-static float *energy, *d_energy;
-static float3 *d_force, *f_temp, *v_temp, *d_virial;
-static cell_list cell_list_gpu;
+
 
 // ---------------------------------------------------------------------------
 // Convert something to a string
@@ -93,8 +76,10 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
   bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                        host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
 
-  ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
-
+  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
+  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
+  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
+
   init_cell_list_const(cell_size, skin, boxlo, boxhi);
 
   return ret;
@@ -153,6 +138,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                     const int nall, const int ago, const bool eflag, const bool vflag,
                     const double *boxlo, const double *boxhi)
 {
+  cudaError_t err;
+
   ljm.atom.nall(nall);
   ljm.atom.inum(inum);
 
@@ -161,8 +148,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
 
   double evdwl=0.0;
 
-  static int buffer = CELL_SIZE;
-  static int ncell = (int)pow((float)ncell1D,3);
+  static int blockSize = BLOCK_1D;
+  static int ncell = ncellx*ncelly*ncellz;
 
   static int first_call = 1;
 
@@ -176,7 +163,7 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_energy, inum*sizeof(float));
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
 
     first_call = 0;
   }
@@ -198,13 +185,13 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
     clear_cell_list(cell_list_gpu);
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
   }
 
   // build cell-list on GPU
   ljm.atom.time_atom.start();
   build_cell_list(host_x[0], host_type, cell_list_gpu,
-                  ncell, ncell1D, buffer, inum, nall, ago);
+                  ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
   ljm.atom.time_atom.stop();
 
   ljm.time_pair.start();
@@ -216,25 +203,32 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
   cudaEventRecord(start, 0);
 #endif
 
+#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e, v, b><<<GX, b, s>>> \
+    (d_force, d_energy, d_virial, \
+     cell_list_gpu.pos, \
+     cell_list_gpu.idx, \
+     cell_list_gpu.type, \
+     cell_list_gpu.natom, \
+     inum, nall, ncell, ncellx, ncelly, ncellz);
+
   // call the cell-list force kernel
-  const int BX=BLOCK_1D;
-  dim3 GX(ncell1D, ncell1D*ncell1D);
+  const int BX=blockSize;
+  dim3 GX(ncellx, ncelly*ncellz);
+
   if (eflag == 0 && vflag == 0) {
-    kernel_lj_cell<false,false><<<GX, BX>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
+    if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
+    if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
   } else {
-    kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64)  KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+  }
+
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("LJ force kernel launch error: %d\n", err);
+    exit(1);
   }
 
 #ifdef TIMING
@@ -296,122 +290,6 @@ EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *
                            ago, eflag, vflag, boxlo, boxhi);
 }
 
-template <class numtyp, class acctyp>
-double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
-                  double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
-                  const double *boxlo, const double *boxhi)
-{
-  ljm.atom.nall(nall);
-  ljm.atom.inum(inum);
-
-
-  ljm.nbor.time_nbor.start();
-  ljm.nbor.time_nbor.stop();
-
-
-  double evdwl=0.0;
-
-#ifdef NOUSE
-  static int first_call = 1;
-
-  if (first_call) {
-    energy = (float*) malloc(inum*sizeof(float));
-    v_temp = (float3*) malloc(inum*2*sizeof(float3));
-    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
-    cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
-    cudaMalloc((void**)&d_force, inum*sizeof(float3));
-    cudaMalloc((void**)&d_energy, inum*sizeof(float));
-    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
-    cudaMalloc((void**)&d_pos, nall*sizeof(float3));
-    cudaMalloc((void**)&d_type, nall*sizeof(int));
-    first_call = 0;
-  }
-
-
-  ljm.atom.time_atom.start();
-  double *atom_pos = host_x[0];
-  for (int i = 0; i < 3*nall; i+=3) {
-    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
-  }
-  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
-
-  ljm.atom.time_atom.stop();
-
-  ljm.time_pair.start();
-
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=BLOCK_1D;
-  dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
-
-#ifdef TIMING
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, 0);
-#endif
-
-  // N^2 force kernel
-  kernel_lj_n2<numtyp><<<GX, BX>>>(d_force, d_energy, d_virial,
-                                   d_pos, d_type, eflag, vflag, inum, nall);
-
-#ifdef TIMING
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  float kTime;
-  cudaEventElapsedTime(&kTime, start, stop);
-  kernelTime += kTime;
-  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-#endif
-
-  // copy results from GPU to CPU
-  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
-  if (eflag) {
-    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      evdwl += energy[i];
-    }
-    evdwl *= 0.5f;
-  }
-  if (vflag) {
-    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      virial[0] += v_temp[2*i].x;
-      virial[1] += v_temp[2*i].y;
-      virial[2] += v_temp[2*i].z;
-      virial[3] += v_temp[2*i+1].x;
-      virial[4] += v_temp[2*i+1].y;
-      virial[5] += v_temp[2*i+1].z;
-    }
-    for (int i = 0; i < 6; i++)
-      virial[i] *= 0.5f;
-  }
-
-  for (int i = 0; i < inum; i++) {
-    force[i][0] += f_temp[i].x;
-    force[i][1] += f_temp[i].y;
-    force[i][2] += f_temp[i].z;
-  }
-#endif
-  ljm.time_pair.stop();
-
-  ljm.atom.time_atom.add_to_total();
-  ljm.nbor.time_nbor.add_to_total();
-  ljm.time_pair.add_to_total();
-
-  return evdwl;
-}
-
-EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
-                        const bool eflag, const bool vflag,
-                        const double *boxlo, const double *boxhi)
-{
-  return _lj_gpu_n2(LJMF, force, virial, host_x, host_type, inum, nall,
-                    eflag, vflag, boxlo, boxhi);
-}
-
 EXTERN void lj_gpu_time() {
   cout.precision(4);
   cout << "Atom copy:  " << LJMF.atom.time_atom.total_seconds() << " s.\n";
diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h
index 7af33ce6de..f783616a83 100644
--- a/lib/gpu/lj_gpu_kernel.h
+++ b/lib/gpu/lj_gpu_kernel.h
@@ -21,25 +21,29 @@
 #define LJ_GPU_KERNEL
 
 /* Cell list version of LJ kernel */
-template<bool eflag, bool vflag>
+template<bool eflag, bool vflag, int blockSize>
 __global__ void kernel_lj_cell(float3 *force3,
                                float *energy, float3 *virial,
                                float3 *cell_list, unsigned int *cell_idx,
                                int *cell_type, int *cell_atom,
-                               const int inum, const int nall, const int ncell)
+                               const int inum, const int nall, const int ncell,
+                               const int ncellx, const int ncelly, const int ncellz)
 {
+
+
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
-
+
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-
-  __shared__ int typeSh[CELL_SIZE];
-  __shared__ float posSh[CELL_SIZE*3];
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+
+  __shared__ int typeSh[blockSize];
+  __shared__ float posSh[blockSize*3];
   __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -51,7 +55,7 @@ __global__ void kernel_lj_cell(float3 *force3,
   __shared__ float *offsetSh;
 
   // load force parameters into shared memory
-  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
     int itype = i/MAX_SHARED_TYPES;
     int jtype = i%MAX_SHARED_TYPES;
     cutsqSh[i] = _cutsq_<float>(itype,jtype);
@@ -65,7 +69,7 @@ __global__ void kernel_lj_cell(float3 *force3,
     lj3Sh = smem;
     lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
     offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
-    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
       int itype = i/MAX_SHARED_TYPES;
       int jtype = i%MAX_SHARED_TYPES;
       lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
@@ -76,41 +80,41 @@ __global__ void kernel_lj_cell(float3 *force3,
 
   __syncthreads();
 
-  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
-      nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
-      nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
+  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
+      nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
+      nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
 
-  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
+  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
     float3 f = {0.0f, 0.0f, 0.0f};
     float ener = 0.0f;
     float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
     int itype;
     float ix, iy, iz;
-    int i = tid + ii*BLOCK_1D;
-    unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
+    int i = tid + ii*blockSize;
+    unsigned int answer_pos = cell_idx[cid*blockSize+i];
 
     // load current cell atom position and type into sMem
-    for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
-      int pid = cid*CELL_SIZE + j;
+    for (int j = tid; j < cell_atom[cid]; j += blockSize) {
+      int pid = cid*blockSize + j;
       float3 pos = cell_list[pid];
       posSh[j            ] = pos.x;
-      posSh[j+  CELL_SIZE] = pos.y;
-      posSh[j+2*CELL_SIZE] = pos.z;
+      posSh[j+  blockSize] = pos.y;
+      posSh[j+2*blockSize] = pos.z;
       typeSh[j] = cell_type[pid];
     }
     __syncthreads();
 
     if (answer_pos < inum) {
       itype = typeSh[i];
       ix = posSh[i            ];
-      iy = posSh[i+  CELL_SIZE];
-      iz = posSh[i+2*CELL_SIZE];
+      iy = posSh[i+  blockSize];
+      iz = posSh[i+2*blockSize];
 
       // compute force from current cell
       for (int j = 0; j < cell_atom[cid]; j++) {
         if (j == i) continue;
         float delx = ix - posSh[j            ];
-        float dely = iy - posSh[j+  CELL_SIZE];
-        float delz = iz - posSh[j+2*CELL_SIZE];
+        float dely = iy - posSh[j+  blockSize];
+        float delz = iz - posSh[j+2*blockSize];
         int jtype = typeSh[j];
         int mtype = itype + jtype*MAX_SHARED_TYPES;
         float r2inv = delx*delx + dely*dely + delz*delz;
@@ -149,16 +153,16 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (nborz == bz && nbory == by && nborx == bx) continue;
 
         // compute cell id
-        int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
-                       INT_MUL(nborz,gridDim.x*gridDim.x);
+        int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
+                       INT_MUL(nborz,INT_MUL(ncellx,ncelly));
 
         // load neighbor cell position and type into smem
-        for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
-          int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
+        for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
+          int pid = INT_MUL(cid_nbor,blockSize) + j;
           float3 pos = cell_list[pid];
           posSh[j            ] = pos.x;
-          posSh[j+  CELL_SIZE] = pos.y;
-          posSh[j+2*CELL_SIZE] = pos.z;
+          posSh[j+  blockSize] = pos.y;
+          posSh[j+2*blockSize] = pos.z;
           typeSh[j] = cell_type[pid];
         }
         __syncthreads();
@@ -166,8 +170,8 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (answer_pos < inum) {
           for (int j = 0; j < cell_atom[cid_nbor]; j++) {
             float delx = ix - posSh[j            ];
-            float dely = iy - posSh[j+  CELL_SIZE];
-            float delz = iz - posSh[j+2*CELL_SIZE];
+            float dely = iy - posSh[j+  blockSize];
+            float delz = iz - posSh[j+2*blockSize];
             int jtype = typeSh[j];
             int mtype = itype + jtype*MAX_SHARED_TYPES;
             float r2inv = delx*delx + dely*dely + delz*delz;
@@ -438,116 +442,4 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
   } // if ii
 
 }
-
-/* Brute force O(N^2) version of LJ kernel */
-template <class numtyp>
-__global__ void kernel_lj_n2(float3 *force3,
-                             float *energy, float3 *virial,
-                             float3 *pos, int *type,
-                             const bool eflag, const bool vflag, const int inum, const int nall)
-{
-  int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
-  int tid = threadIdx.x;
-
-  __shared__ float posSh[BLOCK_1D*3];
-  __shared__ int typeSh[BLOCK_1D];
-  __shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    int itype=tid/MAX_SHARED_TYPES;
-    int jtype=tid%MAX_SHARED_TYPES;
-    cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
-    lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
-    lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
-    lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
-    lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
-    offsetSh[tid]=_offset_<numtyp>(itype,jtype);
-  }
-  __syncthreads();
-
-  float3 f = {0.0f, 0.0f, 0.0f};
-  float ener = 0.0f;
-  float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
-
-  int itype, jtype;
-  int mtype;
-
-  numtyp ix, iy, iz;
-
-  if (gid < inum) {
-    ix = pos[gid].x;
-    iy = pos[gid].y;
-    iz = pos[gid].z;
-    itype = type[gid];
-  }
-
-  int pid = tid;
-  int nIter = ceil((float)nall/BLOCK_1D);
-  for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
-
-    if (pid < nall) {
-      posSh[tid           ] = pos[pid].x;
-      posSh[tid+  BLOCK_1D] = pos[pid].y;
-      posSh[tid+2*BLOCK_1D] = pos[pid].z;
-      typeSh[tid] = type[pid];
-    }
-    __syncthreads();
-
-    if (gid < inum) {
-      int pid_j = jj*BLOCK_1D;
-
-      for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
-        if (jj == blockIdx.x && tid == j) continue;
-        if (pid_j < nall) {
-          numtyp delx = ix - posSh[j           ];
-          numtyp dely = iy - posSh[j+  BLOCK_1D];
-          numtyp delz = iz - posSh[j+2*BLOCK_1D];
-          jtype = typeSh[j];
-          mtype = itype + jtype*MAX_SHARED_TYPES;
-          numtyp r2inv = delx * delx + dely * dely + delz * delz;
-
-          if (r2inv < cutsqSh[mtype]) {
-            r2inv = (numtyp)1.0/r2inv;
-            numtyp r6inv = r2inv * r2inv * r2inv;
-            numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
-            f.x += delx * force;
-            f.y += dely * force;
-            f.z += delz * force;
-
-            if (eflag) {
-              numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
-              ener +=(e-offsetSh[mtype]);
-            }
-            if (vflag) {
-              v0.x += delx*delx*force;
-              v0.y += dely*dely*force;
-              v0.z += delz*delz*force;
-              v1.x += delx*dely*force;
-              v1.y += delx*delz*force;
-              v1.z += dely*delz*force;
-            }
-          }
-        }
-      }
-    }
-
-    __syncthreads();
-  }
-
-  if (gid < inum) {
-    if (eflag)
-      energy[gid] = ener;
-    if (vflag) {
-      virial[2*gid  ] = v0;
-      virial[2*gid+1] = v1;
-    }
-    force3[gid] = f;
-  }
-
-}
-
 #endif
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index 35057bcfad..f5a407ba0b 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -25,8 +25,8 @@
 #include "pair_gpu_atom.h"
 #include "pair_gpu_nbor.h"
 
-#define BLOCK_1D 64
-#define CELL_SIZE 64
+#define BLOCK_1D 64  // max value = 256
+#define CELL_SIZE BLOCK_1D
 #define MAX_SHARED_TYPES 8
 #define PERCENT_GPU_MEMORY 0.7
 #define BIG_NUMBER 100000000
diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu
index 11c78e87d6..2f167df15f 100644
--- a/lib/gpu/pair_gpu_cell.cu
+++ b/lib/gpu/pair_gpu_cell.cu
@@ -17,6 +17,7 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#include <assert.h>
 #include "lj_gpu_memory.h"
 #include "pair_gpu_cell.h"
 
@@ -60,11 +61,14 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
                                        float3 *pos,
                                        int *type,
                                        const int inum,
-                                       const int nall)
+                                       const int nall,
+                                       const int cell_size)
 {
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   if (gid < nall) {
     float3 p = pos[gid];
@@ -75,11 +79,11 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
     p.z = fmaxf(p.z, d_boxlo[2]-cSize);
     p.z = fminf(p.z, d_boxhi[2]+cSize);
 
-    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
-                + (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
+    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx
+                + (int)(p.z/cSize + 1.0) * ncellx * ncelly;
 
     int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
-    int pid = cell_id*CELL_SIZE + atom_pos;
+    int pid = cell_id*cell_size + atom_pos;
 
     cell_list[pid] = pos[gid];
     cell_type[pid] = type[gid];
@@ -92,18 +96,20 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
 {
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
   float skin = d_skin[0];
   float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
@@ -113,7 +119,7 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
   float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
   float hiz = lowz + cSize + skin;
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     float3 p = cell_list[pid];
     p.x = fmaxf(p.x, d_boxlo[0]-cSize);
@@ -136,25 +142,30 @@ __global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int nc
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
 
   if (gid < ncell) {
-    if (cell_atom[gid] > CELL_SIZE)
+    if (cell_atom[gid] > blockDim.x)
       *overflow = 1;
   }
 }
 
 __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
 {
+  float cSize = d_cell_size[0];
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
+
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     cell_list[pid] = pos[cell_idx[pid]];
   }
@@ -164,17 +175,7 @@ __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int
 
 __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2,
                                 unsigned int nbits, unsigned int startbit);
 
-void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
-{
-  int i = 0;
-  const unsigned int bitSize = sizeof(unsigned int)*8;
-  const unsigned int bitStep = 4;
-  const int gSize = size/BLOCK_1D;
-  while (bitSize > i*bitStep) {
-    radixSortBlocks<<<gSize, BLOCK_1D, 4*sizeof(unsigned int)*BLOCK_1D>>>(keys, values1, values2, bitStep, i*bitStep);
-    i++;
-  }
-}
+
 
 #ifdef __DEVICE_EMULATION__
 #define __SYNC __syncthreads();
@@ -253,7 +254,7 @@ __device__ unsigned int rank(unsigned int preds)
   unsigned int address = scan(preds);
 
   __shared__ unsigned int numtrue;
-  if (threadIdx.x == BLOCK_1D - 1)
+  if (threadIdx.x == blockDim.x - 1)
   {
     numtrue = address + preds;
   }
@@ -266,11 +267,12 @@ __device__ unsigned int rank(unsigned int preds)
   return rank;
 }
 
+template <int blockSize>
 __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
 {
   extern __shared__ unsigned int sMem1[];
-  __shared__ float sMem2[BLOCK_1D];
-  __shared__ int sMem3[BLOCK_1D];
+  __shared__ float sMem2[blockSize];
+  __shared__ int sMem3[blockSize];
 
   int tid = threadIdx.x;
 
@@ -314,7 +316,11 @@ __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, u
 
 }
 
-__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
+__global__ void radixSortBlocks(unsigned int *keys,
+                                float3 *values1,
+                                int *values2,
+                                unsigned int nbits,
+                                unsigned int startbit)
 {
   extern __shared__ unsigned int sMem[];
 
@@ -328,13 +334,30 @@ __global__ void radixSortBlocks(unsigned int *keys, float3 *values
   value2 = values2[gid];
   __syncthreads();
 
-  radixSortBlock(&key, &value1, &value2, nbits, startbit);
+  if (blockDim.x == 64)
+    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 128)
+    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 256)
+    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);
 
   keys[gid] = key;
   values1[gid] = value1;
   values2[gid] = value2;
 }
 
+void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
+{
+  int i = 0;
+  const unsigned int bitSize = sizeof(unsigned int)*8;
+  const unsigned int bitStep = 4;
+  const int gSize = size/cell_size;
+  while (bitSize > i*bitStep) {
+    radixSortBlocks<<<gSize, cell_size, 4*sizeof(unsigned int)*cell_size>>>(keys, values1, values2, bitStep, i*bitStep);
+    i++;
+  }
+}
+
 static float3 *d_pos, *pos_temp;
 static int *d_type;
 static int *d_overflow, *d_rebuild;
@@ -376,9 +399,12 @@ void clear_cell_list(cell_list &cell_list_gpu)
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago)
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago)
 {
+
+  cudaError_t err;
+
   cudaMemset(d_overflow, 0, sizeof(int));
   cudaMemset(d_rebuild, 0, sizeof(int));
 
@@ -394,28 +420,24 @@ void build_cell_list(double *atom_pos, int *atom_type,
 
   // copy the last built cell-list and test whether it needs to be rebuilt
   if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
+
+    dim3 grid(ncellx, ncelly*ncellz);
+    kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
                                       cell_list_gpu.idx,
                                       cell_list_gpu.natom, d_pos);
     cudaMemset(d_rebuild, 0, sizeof(int));
     int *temp = (int*)malloc(sizeof(int)*ncell);
-    kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
+    kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
                                          cell_list_gpu.natom, d_rebuild);
     cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
+
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 
-  /*if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
-                                      cell_list_gpu.idx,
-                                      cell_list_gpu.natom, d_pos);
-  }*/
 
   if (ago == 0)
     rebuild = 1;
-
+
   // build cell-list for the first time
   if (first_build || rebuild) {
     first_build = 0;
@@ -431,24 +453,32 @@ void build_cell_list(double *atom_pos, int *atom_type,
                                              cell_list_gpu.idx,
                                              cell_list_gpu.type,
                                              cell_list_gpu.natom,
-                                             d_pos, d_type, inum, nall);
-
+                                             d_pos, d_type, inum, nall, buffer);
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
 
     // check cell list overflow
-    int overflow;
-    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
-    kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
-                                                 d_overflow, ncell);
+    int overflow = 0;
+    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
+    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
+                                               d_overflow, ncell);
     cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
+
    if (overflow > 0) {
-      printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
+      printf("\n BLOCK_1D too small for cell list, please increase it!");
+      printf("\n BLOCK_1D = %d",BLOCK_1D);
+      printf("\n ncell = %d",ncell);
+      printf("\n gDimCell = %d",gDimCell);
+      printf("\n overflow = %d \n",overflow);
      exit(0);
    }
-
+
     // sort atoms in every cell by atom index to avoid floating point associativity problem.
     sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
-               cell_list_gpu.type, ncell*buffer);
+               cell_list_gpu.type, ncell*buffer, buffer);
     cudaThreadSynchronize();
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 }
diff --git a/lib/gpu/pair_gpu_cell.h b/lib/gpu/pair_gpu_cell.h
index 072f5ffbd0..48dab9adb0 100644
--- a/lib/gpu/pair_gpu_cell.h
+++ b/lib/gpu/pair_gpu_cell.h
@@ -20,6 +20,22 @@
 #ifndef PAIR_GPU_CELL_H
 #define PAIR_GPU_CELL_H
 
+#ifdef WINDLL
+#include <windows.h>
+#endif
+
+#ifdef WINDLL
+#define EXTERN extern "C" __declspec(dllexport)
+#else
+#define EXTERN
+#endif
+using namespace std;
+
+static float kernelTime = 0.0;
+static int ncellx, ncelly, ncellz;
+static float *energy, *d_energy;
+static float3 *d_force, *f_temp, *v_temp, *d_virial;
+
 typedef struct {
   float3 *pos;
@@ -28,6 +44,8 @@ typedef struct {
   int *natom;
 } cell_list;
 
+static cell_list cell_list_gpu;
+
 __global__ void kernel_set_cell_list(unsigned int *cell_idx);
 
 __global__ void kernel_build_cell_list(float3 *cell_list,
                                        unsigned int *cell_idx,
@@ -54,8 +72,8 @@ void init_cell_list(cell_list &cell_list_gpu,
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago);
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago);
 
 void clear_cell_list(cell_list &cell_list_gpu);
 
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index f7eb376ea6..6d19972530 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -24,7 +24,6 @@
 #include "nvc_timer.h"
 #include "nvc_memory.h"
 
-#define BLOCK_1D 64
 #define IJ_SIZE 131072
 
 class PairGPUNbor {
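
A note on the block-size dispatch pattern this patch introduces: statically declared
__shared__ arrays (typeSh, posSh, sMem2, sMem3) need a compile-time constant size, so
the patch turns the block size into a template parameter (kernel_lj_cell<e,v,b>,
radixSortBlock<64|128|256>) and maps the runtime blockSize onto that fixed set of
instantiations with an if-chain. Below is a minimal, self-contained CUDA sketch of the
same technique; the names (scale_kernel, launch_scale) are hypothetical illustrations,
not code from the patch.

#include <cstdio>

// The template parameter is a compile-time constant, so the shared
// array can be statically sized per instantiation.
template <int blockSize>
__global__ void scale_kernel(float *data, int n, float factor)  // hypothetical example
{
  __shared__ float buf[blockSize];               // sized by the template parameter
  int gid = blockIdx.x * blockSize + threadIdx.x;

  // Stage through shared memory; guard the load but keep the barrier
  // outside the guard so every thread in the block reaches it.
  buf[threadIdx.x] = (gid < n) ? data[gid] : 0.0f;
  __syncthreads();
  if (gid < n)
    data[gid] = buf[threadIdx.x] * factor;
}

// Map a runtime block size onto the supported instantiations, mirroring
// the KERNEL_LJ_CELL and radixSortBlock<64/128/256> dispatch in the patch.
void launch_scale(float *d_data, int n, float factor, int blockSize)
{
  int grid = (n + blockSize - 1) / blockSize;
  if      (blockSize == 64)  scale_kernel<64> <<<grid, 64>>>(d_data, n, factor);
  else if (blockSize == 128) scale_kernel<128><<<grid, 128>>>(d_data, n, factor);
  else if (blockSize == 256) scale_kernel<256><<<grid, 256>>>(d_data, n, factor);
  else    fprintf(stderr, "unsupported block size: %d\n", blockSize);
}

Each branch compiles to a separate kernel with its own fixed shared-memory footprint,
so supporting a new size means adding one instantiation and one branch, exactly as the
patch does for 64, 128, and 256.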