mirror of https://github.com/lammps/lammps.git
Improvements in lib/gpu
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3804 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent 8eaf196813
commit 71ed5c03ec
@@ -27,29 +27,12 @@
 #include "lj_gpu_memory.cu"
 #include "lj_gpu_kernel.h"
 
-#ifdef WINDLL
-#include <windows.h>
-BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
-{
-  return TRUE;
-}
-#endif
-
-#ifdef WINDLL
-#define EXTERN extern "C" __declspec(dllexport)
-#else
-#define EXTERN
-#endif
-
-using namespace std;
-
 static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
 #define LJMT LJ_GPU_Memory<numtyp,acctyp>
 
-static float kernelTime = 0.0;
-static int ncell1D;
-static float *energy, *d_energy;
-static float3 *d_force, *f_temp, *v_temp, *d_virial;
-static cell_list cell_list_gpu;
-
 // ---------------------------------------------------------------------------
 // Convert something to a string
@@ -93,7 +76,9 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
   bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                        host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
 
-  ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
+  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
+  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
+  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
 
   init_cell_list_const(cell_size, skin, boxlo, boxhi);
 
@@ -153,6 +138,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                     const int nall, const int ago, const bool eflag, const bool vflag,
                     const double *boxlo, const double *boxhi)
 {
+  cudaError_t err;
+
   ljm.atom.nall(nall);
   ljm.atom.inum(inum);
 
@@ -161,8 +148,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
 
   double evdwl=0.0;
 
-  static int buffer = CELL_SIZE;
-  static int ncell = (int)pow((float)ncell1D,3);
+  static int blockSize = BLOCK_1D;
+  static int ncell = ncellx*ncelly*ncellz;
 
   static int first_call = 1;
 
@@ -176,7 +163,7 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_energy, inum*sizeof(float));
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
 
     first_call = 0;
   }
@@ -198,13 +185,13 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
     clear_cell_list(cell_list_gpu);
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
   }
 
   // build cell-list on GPU
   ljm.atom.time_atom.start();
   build_cell_list(host_x[0], host_type, cell_list_gpu,
-                  ncell, ncell1D, buffer, inum, nall, ago);
+                  ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
   ljm.atom.time_atom.stop();
 
   ljm.time_pair.start();
@@ -216,25 +203,32 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
   cudaEventRecord(start, 0);
 #endif
 
+#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
+    (d_force, d_energy, d_virial, \
+     cell_list_gpu.pos, \
+     cell_list_gpu.idx, \
+     cell_list_gpu.type, \
+     cell_list_gpu.natom, \
+     inum, nall, ncell, ncellx, ncelly, ncellz);
+
   // call the cell-list force kernel
-  const int BX=BLOCK_1D;
-  dim3 GX(ncell1D, ncell1D*ncell1D);
+  const int BX=blockSize;
+  dim3 GX(ncellx, ncelly*ncellz);
 
   if (eflag == 0 && vflag == 0) {
-    kernel_lj_cell<false,false><<<GX, BX, 0>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
+    if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
+    if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
   } else {
-    kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64)  KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
   }
 
   err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("LJ force kernel launch error: %d\n", err);
     exit(1);
   }
 
 #ifdef TIMING
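The three-way if chains above follow from the kernel's new int blockSize template parameter: shared-memory arrays such as posSh[blockSize*3] need compile-time sizes, so the runtime block size has to be dispatched onto a fixed set of instantiations (64/128/256). A minimal sketch of the pattern, with illustrative names that are not part of this diff:

    template<int BS>
    __global__ void demo_kernel(float *out) {
      __shared__ float buf[BS];      // size fixed at compile time by the template
      buf[threadIdx.x] = (float)threadIdx.x;
      __syncthreads();
      out[blockIdx.x * BS + threadIdx.x] = buf[BS - 1 - threadIdx.x];
    }

    // host-side dispatch, mirroring the KERNEL_LJ_CELL if-chain above
    void launch_demo(float *d_out, int nblocks, int blockSize) {
      if (blockSize == 64)  demo_kernel<64><<<nblocks, 64>>>(d_out);
      if (blockSize == 128) demo_kernel<128><<<nblocks, 128>>>(d_out);
      if (blockSize == 256) demo_kernel<256><<<nblocks, 256>>>(d_out);
    }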
@@ -296,122 +290,6 @@ EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *
                       ago, eflag, vflag, boxlo, boxhi);
 }
 
-template <class numtyp, class acctyp>
-double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
-                  double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
-                  const double *boxlo, const double *boxhi)
-{
-  ljm.atom.nall(nall);
-  ljm.atom.inum(inum);
-
-  ljm.nbor.time_nbor.start();
-  ljm.nbor.time_nbor.stop();
-
-  double evdwl=0.0;
-
-#ifdef NOUSE
-  static int first_call = 1;
-
-  if (first_call) {
-    energy = (float*) malloc(inum*sizeof(float));
-    v_temp = (float3*) malloc(inum*2*sizeof(float3));
-    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
-    cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
-    cudaMalloc((void**)&d_force, inum*sizeof(float3));
-    cudaMalloc((void**)&d_energy, inum*sizeof(float));
-    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
-    cudaMalloc((void**)&d_pos, nall*sizeof(float3));
-    cudaMalloc((void**)&d_type, nall*sizeof(int));
-    first_call = 0;
-  }
-
-  ljm.atom.time_atom.start();
-  double *atom_pos = host_x[0];
-  for (int i = 0; i < 3*nall; i+=3) {
-    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
-  }
-  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
-
-  ljm.atom.time_atom.stop();
-
-  ljm.time_pair.start();
-
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=BLOCK_1D;
-  dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
-
-#ifdef TIMING
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, 0);
-#endif
-
-  // N^2 force kernel
-  kernel_lj_n2<numtyp, acctyp><<<GX, BX>>>(d_force, d_energy, d_virial,
-                                           d_pos, d_type, eflag, vflag, inum, nall);
-
-#ifdef TIMING
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  float kTime;
-  cudaEventElapsedTime(&kTime, start, stop);
-  kernelTime += kTime;
-  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-#endif
-
-  // copy results from GPU to CPU
-  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
-  if (eflag) {
-    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      evdwl += energy[i];
-    }
-    evdwl *= 0.5f;
-  }
-  if (vflag) {
-    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      virial[0] += v_temp[2*i].x;
-      virial[1] += v_temp[2*i].y;
-      virial[2] += v_temp[2*i].z;
-      virial[3] += v_temp[2*i+1].x;
-      virial[4] += v_temp[2*i+1].y;
-      virial[5] += v_temp[2*i+1].z;
-    }
-    for (int i = 0; i < 6; i++)
-      virial[i] *= 0.5f;
-  }
-
-  for (int i = 0; i < inum; i++) {
-    force[i][0] += f_temp[i].x;
-    force[i][1] += f_temp[i].y;
-    force[i][2] += f_temp[i].z;
-  }
-#endif
-  ljm.time_pair.stop();
-
-  ljm.atom.time_atom.add_to_total();
-  ljm.nbor.time_nbor.add_to_total();
-  ljm.time_pair.add_to_total();
-
-  return evdwl;
-}
-
-EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
-                        const bool eflag, const bool vflag,
-                        const double *boxlo, const double *boxhi)
-{
-  return _lj_gpu_n2<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
-                                             eflag, vflag, boxlo, boxhi);
-}
-
 EXTERN void lj_gpu_time() {
   cout.precision(4);
   cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
@@ -21,25 +21,29 @@
 #define LJ_GPU_KERNEL
 
 /* Cell list version of LJ kernel */
-template<bool eflag, bool vflag>
+template<bool eflag, bool vflag, int blockSize>
 __global__ void kernel_lj_cell(float3 *force3,
                                float *energy, float3 *virial,
                                float3 *cell_list, unsigned int *cell_idx,
                                int *cell_type, int *cell_atom,
-                               const int inum, const int nall, const int ncell)
+                               const int inum, const int nall, const int ncell,
+                               const int ncellx, const int ncelly, const int ncellz)
 {
 
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
 
-  __shared__ int typeSh[CELL_SIZE];
-  __shared__ float posSh[CELL_SIZE*3];
+  __shared__ int typeSh[blockSize];
+  __shared__ float posSh[blockSize*3];
   __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
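The by/bz change above is what makes non-cubic boxes work: the kernel is launched on a 2D grid, dim3 GX(ncellx, ncelly*ncellz), so blockIdx.y packs the y and z cell indices together and must be unpacked with ncelly. The old code divided by gridDim.x, which is only correct when ncellx == ncelly == ncellz. A host-side sketch of the round trip, using illustrative cell counts:

    #include <cassert>

    int main() {
      const int ncellx = 7, ncelly = 5, ncellz = 3;
      int cid = 0;
      for (int bz = 0; bz < ncellz; ++bz)
        for (int by = 0; by < ncelly; ++by)
          for (int bx = 0; bx < ncellx; ++bx, ++cid) {
            int blockIdx_y = by + bz * ncelly;   // packed y index at launch
            int by2 = blockIdx_y % ncelly;       // unpacked as in the kernel
            int bz2 = blockIdx_y / ncelly;
            assert(cid == bx + by2 * ncellx + bz2 * ncellx * ncelly);
          }
      return 0;
    }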
@@ -51,7 +55,7 @@ __global__ void kernel_lj_cell(float3 *force3,
   __shared__ float *offsetSh;
 
   // load force parameters into shared memory
-  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
     int itype = i/MAX_SHARED_TYPES;
     int jtype = i%MAX_SHARED_TYPES;
     cutsqSh[i] = _cutsq_<float>(itype,jtype);
@@ -65,7 +69,7 @@ __global__ void kernel_lj_cell(float3 *force3,
     lj3Sh = smem;
     lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
     offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
-    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
       int itype = i/MAX_SHARED_TYPES;
       int jtype = i%MAX_SHARED_TYPES;
       lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
@@ -76,41 +80,41 @@ __global__ void kernel_lj_cell(float3 *force3,
 
   __syncthreads();
 
-  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
-      nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
-      nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
+  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
+      nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
+      nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
 
-  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
+  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
     float3 f = {0.0f, 0.0f, 0.0f};
     float ener = 0.0f;
     float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
     int itype;
     float ix, iy, iz;
-    int i = tid + ii*BLOCK_1D;
-    unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
+    int i = tid + ii*blockSize;
+    unsigned int answer_pos = cell_idx[cid*blockSize+i];
 
     // load current cell atom position and type into sMem
-    for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
-      int pid = cid*CELL_SIZE + j;
+    for (int j = tid; j < cell_atom[cid]; j += blockSize) {
+      int pid = cid*blockSize + j;
       float3 pos = cell_list[pid];
       posSh[j            ] = pos.x;
-      posSh[j+  CELL_SIZE] = pos.y;
-      posSh[j+2*CELL_SIZE] = pos.z;
+      posSh[j+  blockSize] = pos.y;
+      posSh[j+2*blockSize] = pos.z;
       typeSh[j] = cell_type[pid];
     }
     __syncthreads();
     if (answer_pos < inum) {
       itype = typeSh[i];
       ix = posSh[i            ];
-      iy = posSh[i+  CELL_SIZE];
-      iz = posSh[i+2*CELL_SIZE];
+      iy = posSh[i+  blockSize];
+      iz = posSh[i+2*blockSize];
 
       // compute force from current cell
       for (int j = 0; j < cell_atom[cid]; j++) {
         if (j == i) continue;
         float delx = ix - posSh[j            ];
-        float dely = iy - posSh[j+  CELL_SIZE];
-        float delz = iz - posSh[j+2*CELL_SIZE];
+        float dely = iy - posSh[j+  blockSize];
+        float delz = iz - posSh[j+2*blockSize];
         int jtype = typeSh[j];
         int mtype = itype + jtype*MAX_SHARED_TYPES;
         float r2inv = delx*delx + dely*dely + delz*delz;
@@ -149,16 +153,16 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (nborz == bz && nbory == by && nborx == bx) continue;
 
         // compute cell id
-        int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
-                       INT_MUL(nborz,gridDim.x*gridDim.x);
+        int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
+                       INT_MUL(nborz,INT_MUL(ncellx,ncelly));
 
         // load neighbor cell position and type into smem
-        for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
-          int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
+        for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
+          int pid = INT_MUL(cid_nbor,blockSize) + j;
           float3 pos = cell_list[pid];
           posSh[j            ] = pos.x;
-          posSh[j+  CELL_SIZE] = pos.y;
-          posSh[j+2*CELL_SIZE] = pos.z;
+          posSh[j+  blockSize] = pos.y;
+          posSh[j+2*blockSize] = pos.z;
           typeSh[j] = cell_type[pid];
         }
         __syncthreads();
@@ -166,8 +170,8 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (answer_pos < inum) {
           for (int j = 0; j < cell_atom[cid_nbor]; j++) {
             float delx = ix - posSh[j            ];
-            float dely = iy - posSh[j+  CELL_SIZE];
-            float delz = iz - posSh[j+2*CELL_SIZE];
+            float dely = iy - posSh[j+  blockSize];
+            float delz = iz - posSh[j+2*blockSize];
             int jtype = typeSh[j];
             int mtype = itype + jtype*MAX_SHARED_TYPES;
             float r2inv = delx*delx + dely*dely + delz*delz;
@@ -438,116 +442,4 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
   } // if ii
 }
 
-
-/* Brute force O(N^2) version of LJ kernel */
-template<class numtyp, class acctyp>
-__global__ void kernel_lj_n2(float3 *force3,
-                             float *energy, float3 *virial,
-                             float3 *pos, int *type,
-                             const bool eflag, const bool vflag, const int inum, const int nall)
-{
-  int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
-  int tid = threadIdx.x;
-  __shared__ float posSh[BLOCK_1D*3];
-  __shared__ int typeSh[BLOCK_1D];
-  __shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    int itype=tid/MAX_SHARED_TYPES;
-    int jtype=tid%MAX_SHARED_TYPES;
-    cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
-    lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
-    lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
-    lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
-    lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
-    offsetSh[tid]=_offset_<numtyp>(itype,jtype);
-  }
-  __syncthreads();
-
-  float3 f = {0.0f, 0.0f, 0.0f};
-  float ener = 0.0f;
-  float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
-
-  int itype, jtype;
-  int mtype;
-
-  numtyp ix, iy, iz;
-
-  if (gid < inum) {
-    ix = pos[gid].x;
-    iy = pos[gid].y;
-    iz = pos[gid].z;
-    itype = type[gid];
-  }
-
-  int pid = tid;
-  int nIter = ceil((float)nall/BLOCK_1D);
-  for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
-
-    if (pid < nall) {
-      posSh[tid           ] = pos[pid].x;
-      posSh[tid+  BLOCK_1D] = pos[pid].y;
-      posSh[tid+2*BLOCK_1D] = pos[pid].z;
-      typeSh[tid] = type[pid];
-    }
-    __syncthreads();
-
-    if (gid < inum) {
-      int pid_j = jj*BLOCK_1D;
-
-      for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
-        if (jj == blockIdx.x && tid == j) continue;
-        if (pid_j < nall) {
-          numtyp delx = ix - posSh[j           ];
-          numtyp dely = iy - posSh[j+  BLOCK_1D];
-          numtyp delz = iz - posSh[j+2*BLOCK_1D];
-          jtype = typeSh[j];
-          mtype = itype + jtype*MAX_SHARED_TYPES;
-          numtyp r2inv = delx * delx + dely * dely + delz * delz;
-
-          if (r2inv < cutsqSh[mtype]) {
-            r2inv = (numtyp)1.0/r2inv;
-            numtyp r6inv = r2inv * r2inv * r2inv;
-            numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
-            f.x += delx * force;
-            f.y += dely * force;
-            f.z += delz * force;
-
-            if (eflag) {
-              numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
-              ener +=(e-offsetSh[mtype]);
-            }
-            if (vflag) {
-              v0.x += delx*delx*force;
-              v0.y += dely*dely*force;
-              v0.z += delz*delz*force;
-              v1.x += delx*dely*force;
-              v1.y += delx*delz*force;
-              v1.z += dely*delz*force;
-            }
-          }
-        }
-      }
-    }
-
-    __syncthreads();
-  }
-
-  if (gid < inum) {
-    if (eflag)
-      energy[gid] = ener;
-    if (vflag) {
-      virial[2*gid  ] = v0;
-      virial[2*gid+1] = v1;
-    }
-    force3[gid] = f;
-  }
-
-}
-
 #endif
@@ -25,8 +25,8 @@
 #include "pair_gpu_atom.h"
 #include "pair_gpu_nbor.h"
 
-#define BLOCK_1D 64
-#define CELL_SIZE 64
+#define BLOCK_1D 64   // max value = 256
+#define CELL_SIZE BLOCK_1D
 #define MAX_SHARED_TYPES 8
 #define PERCENT_GPU_MEMORY 0.7
 #define BIG_NUMBER 100000000
@@ -17,6 +17,7 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#include <assert.h>
 #include "lj_gpu_memory.h"
 #include "pair_gpu_cell.h"
 
@@ -60,11 +61,14 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
                                        float3 *pos,
                                        int *type,
                                        const int inum,
-                                       const int nall)
+                                       const int nall,
+                                       const int cell_size)
 {
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   if (gid < nall) {
    float3 p = pos[gid];
@@ -75,11 +79,11 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
     p.z = fmaxf(p.z, d_boxlo[2]-cSize);
     p.z = fminf(p.z, d_boxhi[2]+cSize);
 
-    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
-      + (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
+    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx
+      + (int)(p.z/cSize + 1.0) * ncellx * ncelly;
 
     int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
-    int pid = cell_id*CELL_SIZE + atom_pos;
+    int pid = cell_id*cell_size + atom_pos;
 
     cell_list[pid] = pos[gid];
     cell_type[pid] = type[gid];
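For reference, the cell id above places each clamped position into a grid that carries one layer of ghost cells: the fmaxf/fminf clamp keeps coordinates within one cell width of the box, and the "+ 1.0" shift puts the low-side ghost layer at index 0 along each axis. A host-side sketch of the x-axis indexing, with illustrative box dimensions:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const float boxlo = 0.0f, boxhi = 10.0f, cSize = 2.5f;
      const int ncellx = (int)ceilf((boxhi - boxlo + 2.0f*cSize) / cSize); // 6
      const float xs[] = {-2.0f, 1.0f, 9.9f, 12.0f};
      for (int k = 0; k < 4; ++k) {
        float x = fmaxf(xs[k], boxlo - cSize);   // clamp as in the kernel
        x = fminf(x, boxhi + cSize);
        int ix = (int)(x/cSize + 1.0f);          // ghost layer maps to index 0
        printf("x = %5.1f -> ix = %d of %d\n", xs[k], ix, ncellx);
      }
      return 0;
    }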
@@ -92,18 +96,20 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
 {
 
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
   float skin = d_skin[0];
   float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
@@ -113,7 +119,7 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
   float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
   float hiz = lowz + cSize + skin;
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     float3 p = cell_list[pid];
     p.x = fmaxf(p.x, d_boxlo[0]-cSize);
@@ -136,25 +142,30 @@ __global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int nc
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
 
   if (gid < ncell) {
-    if (cell_atom[gid] > CELL_SIZE)
+    if (cell_atom[gid] > blockDim.x)
       *overflow = 1;
   }
 }
 
 __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
 {
+  float cSize = d_cell_size[0];
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
+
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     cell_list[pid] = pos[cell_idx[pid]];
   }
@@ -164,17 +175,7 @@ __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int
 
 __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);
 
-void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
-{
-  int i = 0;
-  const unsigned int bitSize = sizeof(unsigned int)*8;
-  const unsigned int bitStep = 4;
-  const int gSize = size/BLOCK_1D;
-  while (bitSize > i*bitStep) {
-    radixSortBlocks<<<gSize, BLOCK_1D, 2*BLOCK_1D*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
-    i++;
-  }
-}
-
 
 #ifdef __DEVICE_EMULATION__
 #define __SYNC __syncthreads();
@@ -253,7 +254,7 @@ __device__ unsigned int rank(unsigned int preds)
   unsigned int address = scan(preds);
 
   __shared__ unsigned int numtrue;
-  if (threadIdx.x == BLOCK_1D - 1)
+  if (threadIdx.x == blockDim.x - 1)
   {
     numtrue = address + preds;
   }
@@ -266,11 +267,12 @@ __device__ unsigned int rank(unsigned int preds)
   return rank;
 }
 
+template<int blockSize>
 __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
 {
   extern __shared__ unsigned int sMem1[];
-  __shared__ float sMem2[BLOCK_1D];
-  __shared__ int sMem3[BLOCK_1D];
+  __shared__ float sMem2[blockSize];
+  __shared__ int sMem3[blockSize];
 
   int tid = threadIdx.x;
 
@@ -314,7 +316,11 @@ __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, u
 
 }
 
-__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
+__global__ void radixSortBlocks(unsigned int *keys,
+                                float3 *values1,
+                                int *values2,
+                                unsigned int nbits,
+                                unsigned int startbit)
 {
 
   extern __shared__ unsigned int sMem[];
@@ -328,13 +334,30 @@ __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values
   value2 = values2[gid];
   __syncthreads();
 
-  radixSortBlock(&key, &value1, &value2, nbits, startbit);
+  if (blockDim.x == 64)
+    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 128)
+    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 256)
+    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);
 
   keys[gid] = key;
   values1[gid] = value1;
   values2[gid] = value2;
 }
 
+void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
+{
+  int i = 0;
+  const unsigned int bitSize = sizeof(unsigned int)*8;
+  const unsigned int bitStep = 4;
+  const int gSize = size/cell_size;
+  while (bitSize > i*bitStep) {
+    radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
+    i++;
+  }
+}
+
 static float3 *d_pos, *pos_temp;
 static int *d_type;
 static int *d_overflow, *d_rebuild;
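The relocated sortBlocks driver above is still a least-significant-digit radix sort: 4 bits per pass over 32-bit keys, so the while loop issues ceil(32/4) = 8 kernel launches, each block sorting its own cell's segment of the key/value arrays. The pass count, worked as a plain host-side loop:

    #include <stdio.h>

    int main(void) {
      const unsigned int bitSize = sizeof(unsigned int)*8; // 32 bits per key
      const unsigned int bitStep = 4;                      // bits sorted per pass
      unsigned int passes = 0;
      for (unsigned int i = 0; bitSize > i*bitStep; ++i)   // mirrors the while loop
        ++passes;
      printf("radix passes: %u\n", passes);                // prints 8
      return 0;
    }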
@@ -376,9 +399,12 @@ void clear_cell_list(cell_list &cell_list_gpu)
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago)
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago)
 {
 
+  cudaError_t err;
+
   cudaMemset(d_overflow, 0, sizeof(int));
   cudaMemset(d_rebuild, 0, sizeof(int));
 
@@ -394,26 +420,22 @@ void build_cell_list(double *atom_pos, int *atom_type,
 
   // copy the last built cell-list and test whether it needs to be rebuilt
   if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
+
+    dim3 grid(ncellx, ncelly*ncellz);
+    kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
                                        cell_list_gpu.idx,
                                        cell_list_gpu.natom, d_pos);
     cudaMemset(d_rebuild, 0, sizeof(int));
     int *temp = (int*)malloc(sizeof(int)*ncell);
-    kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
+    kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
                                           cell_list_gpu.natom,
                                           d_rebuild);
     cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
+
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 
-  /*if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
-                                      cell_list_gpu.idx,
-                                      cell_list_gpu.natom, d_pos);
-  }*/
   if (ago == 0) rebuild = 1;
 
   // build cell-list for the first time
@@ -431,24 +453,32 @@ void build_cell_list(double *atom_pos, int *atom_type,
                                        cell_list_gpu.idx,
                                        cell_list_gpu.type,
                                        cell_list_gpu.natom,
-                                       d_pos, d_type, inum, nall);
-
+                                       d_pos, d_type, inum, nall, buffer);
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
     // check cell list overflow
-    int overflow;
-    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
-    kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
+    int overflow = 0;
+    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
+    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
                                                  d_overflow, ncell);
     cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
 
     if (overflow > 0) {
-      printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
+      printf("\n BLOCK_1D too small for cell list, please increase it!");
+      printf("\n BLOCK_1D = %d",BLOCK_1D);
+      printf("\n ncell = %d",ncell);
+      printf("\n gDimCell = %d",gDimCell);
+      printf("\n overflow = %d \n",overflow);
       exit(0);
     }
 
     // sort atoms in every cell by atom index to avoid floating point associativity problem.
     sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
-               cell_list_gpu.type, ncell*buffer);
+               cell_list_gpu.type, ncell*buffer, buffer);
 
     cudaThreadSynchronize();
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 
 }
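The sort called out above ("avoid floating point associativity problem") fixes the intra-cell atom order so each force sum is accumulated in the same sequence every run; float addition is not associative, so a different order can give bitwise-different totals. A minimal demonstration of the underlying effect:

    #include <stdio.h>

    int main(void) {
      float a = 1.0e8f, b = -1.0e8f, c = 1.0f;
      float s1 = (a + b) + c;  // = 1.0f
      float s2 = a + (b + c);  // = 0.0f: c is absorbed into b at this magnitude
      printf("s1 = %g, s2 = %g\n", s1, s2);
      return 0;
    }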
@@ -20,6 +20,22 @@
 #ifndef PAIR_GPU_CELL_H
 #define PAIR_GPU_CELL_H
 
+#ifdef WINDLL
+#include <windows.h>
+#endif
+
+#ifdef WINDLL
+#define EXTERN extern "C" __declspec(dllexport)
+#else
+#define EXTERN
+#endif
+using namespace std;
+
+static float kernelTime = 0.0;
+static int ncellx, ncelly, ncellz;
+static float *energy, *d_energy;
+static float3 *d_force, *f_temp, *v_temp, *d_virial;
+
 
 typedef struct {
   float3 *pos;
@@ -28,6 +44,8 @@ typedef struct {
   int *natom;
 } cell_list;
 
+static cell_list cell_list_gpu;
+
 __global__ void kernel_set_cell_list(unsigned int *cell_idx);
 __global__ void kernel_build_cell_list(float3 *cell_list,
                                        unsigned int *cell_idx,
@@ -54,8 +72,8 @@ void init_cell_list(cell_list &cell_list_gpu,
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago);
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago);
 
 void clear_cell_list(cell_list &cell_list_gpu);
 
@@ -24,7 +24,6 @@
 #include "nvc_timer.h"
 #include "nvc_memory.h"
 
-#define BLOCK_1D 64
 #define IJ_SIZE 131072
 
 class PairGPUNbor {