diff --git a/lib/gpu/README b/lib/gpu/README
index 02859b17ad..d459a59c96 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -74,13 +74,3 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in LAMMPS.
 
-                              GPU MEMORY
-
-Upon initialization of the gayberne/gpu pair style, the library will reserve
-memory for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
-is limiting. The value of 70% can be changed by editing the
-PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
-of 64K cannot be increased and is the maximum number of atoms allowed per
-GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
-can help to increase maximum number of atoms per GPU for cards with
-limited memory.
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu
index 10c6557b31..f95e1cbd77 100644
--- a/lib/gpu/gb_gpu.cu
+++ b/lib/gpu/gb_gpu.cu
@@ -36,7 +36,7 @@ static GB_GPU_Memory GBMF[MAX_GPU_THREADS];
 // -- Only pack neighbors within cutoff
 // ---------------------------------------------------------------------------
 template<class numtyp>
-__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
                                  const int start, const int inum,
                                  const int *dev_ij, const int form_low,
                                  const int form_high, const int nall) {
@@ -56,25 +56,24 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     int newj=0;
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
         // Compute r12;
-        numtyp rsq=_x_<numtyp>(j,0)-ix;
+        numtyp rsq=jx.x-ix.x;
         rsq*=rsq;
-        numtyp t=_x_<numtyp>(j,1)-iy;
+        numtyp t=jx.y-ix.y;
         rsq+=t*t;
-        t=_x_<numtyp>(j,2)-iz;
+        t=jx.z-ix.z;
         rsq+=t*t;
 
         if (rsq< _cutsq_(itype,jtype)) {
@@ -95,7 +94,7 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
 // -- Fast version of routine that uses shared memory for LJ constants
 // ---------------------------------------------------------------------------
 template<class numtyp>
-__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
                                       const int start, const int inum,
                                       const int *dev_ij, const int form_low,
                                       const int form_high, const int nall) {
@@ -124,25 +123,25 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
+    vec4 ix=x_[i];
+    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
 
     int newj=0;
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
-      int mtype=itype+_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
 
       if (form[mtype]>=form_low && form[mtype]<=form_high) {
         // Compute r12;
-        numtyp rsq=_x_<numtyp>(j,0)-ix;
+        numtyp rsq=jx.x-ix.x;
         rsq*=rsq;
-        numtyp t=_x_<numtyp>(j,1)-iy;
+        numtyp t=jx.y-ix.y;
         rsq+=t*t;
-        t=_x_<numtyp>(j,2)-iz;
+        t=jx.z-ix.z;
         rsq+=t*t;
 
         if (rsq<cutsq[mtype]) {
[...]
 template<class numtyp, class acctyp>
 void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                 const int inum, const int form_low, const int form_high) {
-  if (gbm.shared_types)
+  if (gbm.shared_types) {
     kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
+      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
+       gbm.atom.inum(), start, inum,
        gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
-  else
+  } else
     kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
+      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
+       gbm.atom.inum(), start, inum,
        gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
 }
 
@@ -188,9 +189,7 @@ EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
   string sname=GBMF[0].gpu.name(id)+", "+
                gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
                gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
-               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
-               gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
-                                                max_nbors))+" Atoms";
+               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
   strcpy(name,sname.c_str());
 }
 
@@ -203,6 +202,7 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                         double **epsilon, double *host_lshape, int **form,
                         double **host_lj1, double **host_lj2, double **host_lj3,
                         double **host_lj4, double **offset, double *special_lj,
+                        const int nlocal, const int nall,
                         const int max_nbors, const int thread,
                         const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);
[...]
 template<class numtyp, class acctyp>
 inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
-                        double **host_quat, const int *host_type,
-                        const bool rebuild, cudaStream_t &stream) {
+                         double **host_quat, const int *host_type,
+                         const bool rebuild, cudaStream_t &stream) {
   atom.time_atom.start();
   atom.reset_write_buffer();
 
   // Rows 1-3 of dev_x are position; rows 4-7 are quaternion
-  atom.add_atom_data(host_x[0],3);
-  atom.add_atom_data(host_x[0]+1,3);
-  atom.add_atom_data(host_x[0]+2,3);
-  atom.add_atom_data(host_quat[0],4);
-  atom.add_atom_data(host_quat[0]+1,4);
-  atom.add_atom_data(host_quat[0]+2,4);
-  atom.add_atom_data(host_quat[0]+3,4);
-
-  int csize=7;
-
-  // If a rebuild occured, copy type data
-  if (rebuild) {
-    atom.add_atom_data(host_type);
-    csize++;
-  }
-
-  atom.copy_atom_data(csize,stream);
+  atom.add_x_data(host_x,host_type);
+  atom.add_q_data(host_quat[0]);
+
+  atom.copy_x_data(stream);
+  atom.copy_q_data(stream);
   atom.time_atom.stop();
 }
 
 EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
-                        const int *host_type, const bool rebuild, const int thread) {
+                        const int *host_type, const bool rebuild,
+                        const int thread) {
   _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
                GBMF[thread].pair_stream);
 }
 
@@ -269,46 +259,111 @@
 template<class gbmtyp>
 int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                           const int inum, int *ilist, const int *numj,
                           const int *type, bool &success) {
-  if (nall>gbm.max_atoms) {
-    success=false;
-    return 0;
-  }
   success=true;
 
   gbm.nbor.time_nbor.start();
 
+  int mn=0;
+  for (int i=0; i<inum; i++)
+    mn=std::max(mn,numj[i]);
+
+  if (nall>gbm.max_atoms)
+    gbm.resize_atom(nall,success);
+  if (nlocal>gbm.max_local || mn>gbm._max_nbors)
+    gbm.resize_local(nlocal,mn,success);
+  if (!success)
+    return false;
+
   gbm.atom.nall(nall);
   gbm.atom.inum(inum);
 
   if (gbm.multiple_forms) {
-    int p=0, acc=0;
-    for (int i=0; i
[...]
+      if (hi>0) {
+        gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
+                                        hi,gbm.pair_stream);
+        gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
+                                        inum*2+offset,
+                                        hi,gbm.pair_stream);
+      }
+      gbm.nbor.ij_total=0;
     }
-    gbm.nbor.ij_total=0;
-    gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
-    gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
-                                      gbm.nbor.dev_nbor.row_size(),
-                                      gbm.nbor.dev_nbor.row_size(),2,inum,
-                                      gbm.pair_stream);
   } else {
     gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
     gbm.last_ellipse=inum;
@@ -349,6 +404,14 @@ EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
   _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
+template<class numtyp, class acctyp>
+void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
+  gbm.atom.time_answer.start();
+  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
+  gbm.atom.time_answer.stop();
+}
+
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques for all ij interactions
 // ---------------------------------------------------------------------------
@@ -357,7 +420,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                       const bool rebuild) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=BLOCK_1D;
-
+  int ans_pitch=6;
+  if (eflag)
+    ans_pitch++;
+  if (vflag)
+    ans_pitch+=6;
+
   int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
 
   if (gbm.multiple_forms) {
@@ -371,9 +439,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       gbm.time_gayberne.start();
       kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+         gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
          eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
       gbm.time_gayberne.stop();
 
@@ -393,14 +462,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                                gbm.last_ellipse)/BX));
       pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
-                 ELLIPSE_SPHERE);
+                 ELLIPSE_SPHERE);
       gbm.time_kernel2.stop();
 
       gbm.time_gayberne2.start();
       kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+         gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
          eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
       gbm.time_gayberne2.stop();
     } else {
@@ -419,16 +489,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       if (gbm.last_ellipse<gbm.atom.inum())
         kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
-           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
-           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
-           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
-           gbm.atom.inum(), gbm.atom.nall());
+          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
+           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
+           gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
+           vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
       else
         kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
-           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
-           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
+           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
+           gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
           eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
     }
     gbm.time_pair.stop();
@@ -437,19 +506,21 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
     pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
 
-    gbm.time_gayberne.start();
+    gbm.time_gayberne.start();
     kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-       gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-       gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
+      ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+       gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+       gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+       gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
       eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
     gbm.time_gayberne.stop();
   }
 }
 
 EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
-                            const int thread) {
+                            const int thread) {
   _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild);
+  _gb_gpu_enqueue(GBMF[thread],eflag,vflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -462,9 +533,6 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                       double *virial) {
   double evdw;
 
-  gbm.atom.time_answer.start();
-  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
-
   gbm.atom.time_atom.add_to_total();
   gbm.nbor.time_nbor.add_to_total();
   gbm.time_kernel.add_to_total();
@@ -475,11 +543,19 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_pair.add_to_total();
   }
   CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
-
-  evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
-  gbm.atom.add_forces(ilist,f);
-  gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
-  gbm.atom.time_answer.stop();
+  if (gbm.last_ellipse>gbm.atom.inum()) {
+    if (eflag || vflag)
+      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
+                                  f,tor,gbm.atom.inum());
+    else
+      gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
+  } else {
+    if (eflag || vflag)
+      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
+                                  f,tor,gbm.last_ellipse);
+    else
+      gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
+  }
   gbm.atom.time_answer.add_to_total();
   return evdw;
 }
@@ -516,3 +592,4 @@ EXTERN int gb_gpu_num_devices() {
 EXTERN double gb_gpu_bytes() {
   return GBMF[0].host_memory_usage();
 }
+
diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h
index 0e060a80a8..1e51389197 100644
--- a/lib/gpu/gb_gpu_extra.h
+++ b/lib/gpu/gb_gpu_extra.h
@@ -298,24 +298,22 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
    ------------------------------------------------------------------------- */
 
 template<class numtyp>
-static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
+static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
+                                                        const int qi,
                                                         numtyp mat[9]) {
-  numtyp qi3=_x_<numtyp>(qi,3);
-  numtyp qi4=_x_<numtyp>(qi,4);
-  numtyp qi5=_x_<numtyp>(qi,5);
-  numtyp qi6=_x_<numtyp>(qi,6);
-
-  numtyp w2 = qi3*qi3;
-  numtyp i2 = qi4*qi4;
-  numtyp j2 = qi5*qi5;
-  numtyp k2 = qi6*qi6;
-  numtyp twoij = (numtyp)2.0*qi4*qi5;
-  numtyp twoik = (numtyp)2.0*qi4*qi6;
-  numtyp twojk = (numtyp)2.0*qi5*qi6;
-  numtyp twoiw = (numtyp)2.0*qi4*qi3;
-  numtyp twojw = (numtyp)2.0*qi5*qi3;
-  numtyp twokw = (numtyp)2.0*qi6*qi3;
+  vec4 q=qif[qi];
+
+  numtyp w2 = q.x*q.x;
+  numtyp i2 = q.y*q.y;
+  numtyp j2 = q.z*q.z;
+  numtyp k2 = q.w*q.w;
+  numtyp twoij = (numtyp)2.0*q.y*q.z;
+  numtyp twoik = (numtyp)2.0*q.y*q.w;
+  numtyp twojk = (numtyp)2.0*q.z*q.w;
+  numtyp twoiw = (numtyp)2.0*q.y*q.x;
+  numtyp twojw = (numtyp)2.0*q.z*q.x;
+  numtyp twokw = (numtyp)2.0*q.w*q.x;
 
   mat[0] = w2+i2-j2-k2;
   mat[3] = twoij-twokw;
diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h
index 110a41fecf..b3f8b3ccc0 100644
--- a/lib/gpu/gb_gpu_kernel.h
+++ b/lib/gpu/gb_gpu_kernel.h
@@ -92,16 +92,19 @@ static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                       m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
+#include "gb_gpu_kernel.h"
+
 template<class numtyp, class acctyp>
-__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
-                                const int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
+                                const numtyp *gum, const numtyp *special_lj,
+                                const int *dev_nbor, const size_t nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int inum, const int nall) {
   __shared__ numtyp sp_lj[4];
 
-  // ii indexes the two interacting particles in gi
+  // ii indexes the two interacting particles in gi
   int ii=threadIdx.x;
   if (ii<4)
     sp_lj[ii]=special_lj[ii];
@@ -109,7 +112,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
   __syncthreads();
 
   if (ii<inum) {
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     numtyp a1[9], b1[9], g1[9];
     {
       numtyp t[9];
-      gpu_quat_to_mat_trans(i,a1);
+      gpu_quat_to_mat_trans(q,i,a1);
       gpu_shape_times3(itype,a1,t);
       gpu_transpose_times3(a1,t,g1);
       gpu_well_times3(itype,a1,t);
@@ -146,7 +147,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
       int j=*nbor;
       if (j < nall)
         factor_lj = (numtyp)1.0;
       else {
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
       numtyp r12[3];
-      r12[0] = _x_<numtyp>(j,0)-ix;
-      r12[1] = _x_<numtyp>(j,1)-iy;
-      r12[2] = _x_<numtyp>(j,2)-iz;
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
 
       numtyp ir = gpu_dot3(r12,r12);
       ir = rsqrt(ir);
       numtyp r = (numtyp)1.0/ir;
 
       numtyp a2[9];
-      gpu_quat_to_mat_trans(j,a2);
+      gpu_quat_to_mat_trans(q,j,a2);
 
       numtyp u_r, dUr[3], tUr[3], eta, teta[3];
       {                           // Compute U_r, dUr, eta, and teta
@@ -366,35 +368,37 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fy;
-    ap1+=ans_pitch;
+    ap1++;
    *ap1=fz;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=torx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=tory;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=torz;
 
   } // if ii
+
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
-                                 const int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
+                                 const numtyp *gum, const numtyp *special_lj,
+                                 const int *dev_nbor, const size_t nbor_pitch,
                                  acctyp *ans, size_t ans_pitch, int *err_flag,
                                  const bool eflag, const bool vflag,
                                  const int start, const int inum,
@@ -425,12 +429,10 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
+  const int *nbor_end=nbor+nbor_pitch*numj;
 
-  numtyp ix=_x_<numtyp>(i,0);
-  numtyp iy=_x_<numtyp>(i,1);
-  numtyp iz=_x_<numtyp>(i,2);
-  int itype=_x_<numtyp>(i,7);
+  vec4 ix=x_[i];
+  int itype=ix.w;
 
   numtyp oner=_shape_(itype,0);
   numtyp one_well=_well_(itype,0);
@@ -445,13 +447,14 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
       numtyp r12[3];
-      r12[0] = _x_<numtyp>(j,0)-ix;
-      r12[1] = _x_<numtyp>(j,1)-iy;
-      r12[2] = _x_<numtyp>(j,2)-iz;
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
 
       numtyp ir = gpu_dot3(r12,r12);
       ir = rsqrt(ir);
@@ -463,7 +466,7 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
       r12hat[2]=r12[2]*ir;
 
       numtyp a2[9];
-      gpu_quat_to_mat_trans(j,a2);
+      gpu_quat_to_mat_trans(q,j,a2);
 
       numtyp u_r, dUr[3], eta;
       {                           // Compute U_r, dUr, eta, and teta
@@ -611,28 +614,29 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fy;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
-                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
+__global__ void kernel_lj(const vec4 *x_,
+                          const numtyp *special_lj, const int *dev_nbor,
+                          const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
                           size_t ans_pitch, int *err_flag,
                           const bool eflag, const bool vflag,
                           const int start, const int inum, const int nall) {
@@ -663,10 +667,8 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
[...]
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
 
       if (r2inv<_cutsq_(itype,jtype) &&
@@ -716,29 +719,30 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1+=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
      for (int i=0; i<6; i++) {
         *ap1+=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1+=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1+=fy;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1+=fz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
-                               const int *dev_ij, const int nbor_pitch,
+__global__ void kernel_lj_fast(const vec4 *x_,
+                               const numtyp *special_lj, const int *dev_nbor,
+                               const size_t nbor_pitch, const int *dev_ij,
                                acctyp *ans, size_t ans_pitch,int *err_flag,
                                const bool eflag, const bool vflag,
                                const int start, const int inum,
                                const int nall){
@@ -788,10 +792,8 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
+    vec4 ix=x_[i];
+    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
[...]
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int mtype=itype+_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int mtype=itype+jx.w;
 
       // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
 
       if (r2inv
[...]
diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu
[...]
   if (this->allocated)
     clear();
 
   bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                               host_lj1, host_lj2, host_lj3, host_lj4,
-                              host_offset, host_special_lj, max_nbors, me);
+                              host_offset, host_special_lj, max_nbors, me,
+                              nlocal, nall);
 
   if (!p)
     return false;
 
@@ -96,11 +99,28 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
     multiple_forms=true;
 
   // Memory for ilist ordered by particle type
-  host_olist.safe_alloc_rw(this->max_atoms);
-
-  return true;
+  return host_olist.alloc_rw(this->max_local);
 }
-
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
+  this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
+  this->atom.resize(this->max_atoms, success);
+}
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
+                                  bool &success) {
+  if (nlocal>this->max_local) {
+    this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
+    host_olist.clear();
+    success=success && host_olist.alloc_rw(this->max_local);
+  }
+  if (max_nbors>_max_nbors)
+    _max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
+  this->nbor.resize(this->max_local,_max_nbors,success);
+}
+
 template <class numtyp, class acctyp>
 void GB_GPU_MemoryT::clear() {
   if (!this->allocated)
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 496ce2033d..63627065bf 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -37,8 +37,11 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
             double **host_epsilon, double *host_lshape, int **h_form,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
-            const int max_nbors, const bool force_d, const int me);
+            const int max_nbors, const int nlocal, const int nall,
+            const bool force_d, const int me);
 
+  void resize_atom(const int nall, bool &success);
+  void resize_local(const int nlocal, const int max_nbors, bool &success);
   void clear();
 
   double host_memory_usage();
@@ -61,6 +64,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
   bool multiple_forms;
   int **host_form;
   int last_ellipse;
+  int _max_nbors;
 
  private:
 };
diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index e31321ad2e..e96d3efb4f 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -74,7 +74,8 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
   ij_size=IJ_SIZE;
 
   bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
-                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
+                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
+                       0,0);
 
   ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
   ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
@@ -101,37 +102,6 @@ EXTERN void lj_gpu_clear() {
 }
 
-// ---------------------------------------------------------------------------
-// Calculate energies and forces for all ij interactions
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=BLOCK_1D;
-
-  int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
-
-  ljm.time_pair.start();
-
-  if (ljm.shared_types)
-    kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
-      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
-       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
-       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
-       vflag, ljm.atom.inum(), ljm.atom.nall());
-  else
-    kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
-      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
-       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
-       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
-       vflag, ljm.atom.inum(), ljm.atom.nall());
-  ljm.time_pair.stop();
-}
-
-EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
-  _lj_gpu(LJMF, eflag,vflag,rebuild);
-}
-
 template <class numtyp, class acctyp>
 double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                     double **host_x, int *host_type, const int inum,
diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h
index f783616a83..b2f03cde2b 100644
--- a/lib/gpu/lj_gpu_kernel.h
+++ b/lib/gpu/lj_gpu_kernel.h
@@ -217,229 +217,4 @@ __global__ void kernel_lj_cell(float3 *force3,
 }
 
-
-/* Neigbhor list version of LJ kernel */
-template <class numtyp, class acctyp>
-__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
-                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
-                          size_t ans_pitch, const bool eflag,
-                          const bool vflag, const int inum, const int nall) {
-  __shared__ numtyp sp_lj[4];
-
-  // ii indexes the two interacting particles in gi
-  int ii=threadIdx.x;
-  if (ii<4)
-    sp_lj[ii]=special_lj[ii];
-  ii+=INT_MUL(blockIdx.x,blockDim.x);
-
-  if (ii
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,3);
-
-    numtyp factor_lj;
-    for ( ; list
[...]
-      int jtype=_x_<numtyp>(j,3);
-
-      // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
-      numtyp r2inv = delx*delx+dely*dely+delz*delz;
-
-      if (r2inv<_cutsq_(itype,jtype)) {
-        r2inv=(numtyp)1.0/r2inv;
-        numtyp r6inv =r2inv*r2inv*r2inv;
-        numtyp force =factor_lj*r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv-
-                                             _lj1_(itype,jtype).y);
-
-        fx+=delx*force;
-        fy+=dely*force;
-        fz+=delz*force;
-
-        if (eflag) {
-          numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv-
-                          _lj3_(itype,jtype).y);
-          energy+=factor_lj*(e-_offset_(1,1));
-        }
-        if (vflag) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
-      }
-
-    } // for nbor
-
-    // Store answers
-    acctyp *ap1=ans+ii;
-    if (eflag) {
-      *ap1=energy;
-      ap1+=ans_pitch;
-    }
-    if (vflag) {
-      for (int i=0; i<6; i++) {
-        *ap1=virial[i];
-        ap1+=ans_pitch;
-      }
-    }
-    *ap1=fx;
-    ap1+=ans_pitch;
-    *ap1=fy;
-    ap1+=ans_pitch;
-    *ap1=fz;
-
-  } // if ii
-}
-
-template <class numtyp, class acctyp>
-__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
-                               const int *dev_ij, const int nbor_pitch,
-                               acctyp *ans, size_t ans_pitch,const bool eflag,
-                               const bool vflag, const int inum,
-                               const int nall) {
-
-  // ii indexes the two interacting particles in gi
-  int ii=threadIdx.x;
-  __shared__ numtyp sp_lj[4];
-  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  if (ii<4)
-    sp_lj[ii]=special_lj[ii];
-  if (ii
[...]
-    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
-    lj1[ii]=_lj1_(itype,jtype).x;
-    lj2[ii]=_lj1_(itype,jtype).y;
-    if (eflag) {
-      lj3[ii]=_lj3_(itype,jtype).x;
-      lj4[ii]=_lj3_(itype,jtype).y;
-      offset[ii]=_offset_(itype,jtype);
-    }
-  }
-  ii+=INT_MUL(blockIdx.x,blockDim.x);
-
-  if (ii
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
-
-    numtyp factor_lj;
-
-    for ( ; list
[...]
-      int mtype=itype+_x_<numtyp>(j,3);
-
-      // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
-      numtyp r2inv = delx*delx+dely*dely+delz*delz;
-
-      if (r2inv
[...]
diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu
[...]
-template <class numtyp, class acctyp>
-int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
-  int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
-                              bytes_per_atom(max_nbors));
-  if (matoms>MAX_ATOMS)
-    matoms=MAX_ATOMS;
-  return matoms;
-}
-
 template <class numtyp, class acctyp>
 bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
@@ -41,7 +32,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int max_nbors,
-                          const int me) {
+                          const int me, const int nlocal, const int nall) {
   if (allocated)
     clear();
 
@@ -55,9 +46,18 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   time_pair.init();
 
   // Initialize atom and nbor data
-  max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
-  atom.init(max_atoms);
-  nbor.init(ij_size,max_atoms,max_nbors);
+  max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
+  if (max_local==0)
+    max_local=1000;
+  if (nall<=nlocal)
+    max_atoms=max_local*2;
+  else
+    max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
+
+  if (!atom.init(max_atoms))
+    return false;
+  if (!nbor.init(ij_size,max_local,max_nbors))
+    return false;
 
   // Get a stream for computing pair potentials
   CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
@@ -112,7 +112,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   allocated=true;
   return true;
 }
-
+
 template <class numtyp, class acctyp>
 void LJ_GPU_MemoryT::clear() {
   if (!allocated)
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index f5a407ba0b..3a292cb74c 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -28,7 +28,6 @@
 #define BLOCK_1D 64 // max value = 256
 #define CELL_SIZE BLOCK_1D
 #define MAX_SHARED_TYPES 8
-#define PERCENT_GPU_MEMORY 0.7
 #define BIG_NUMBER 100000000
 
 template <class numtyp, class acctyp>
@@ -42,14 +41,13 @@ class LJ_GPU_Memory {
             double **host_sigma, double **host_epsilon,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
-            const int max_nbors, const int me);
+            const int max_nbors, const int me, const int nlocal,
+            const int nall);
   /// Free any memory on host and device
   void clear();
 
   /// Returns memory usage on GPU per atom
   int bytes_per_atom(const int max_nbors) const;
-  /// Maximum number of atoms that can be stored on GPU
-  int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
   /// Total host memory used by library
   double host_memory_usage() const;
@@ -72,7 +70,7 @@ class LJ_GPU_Memory {
   NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
   NVC_VecT special_lj;
 
-  size_t max_atoms;
+  size_t max_atoms, max_local;
 
   // Timing for pair calculation
   NVCTimer time_pair;
diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu
index 652c4fc22a..6b54f10f41 100644
--- a/lib/gpu/nvc_get_devices.cu
+++ b/lib/gpu/nvc_get_devices.cu
@@ -8,7 +8,6 @@
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
/* ----------------------------------------------------------------------
@@ -17,6 +16,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include "nvc_device.h"
 
 int main(int argc, char** argv) {
diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h
index 2273b96895..ef61684c46 100644
--- a/lib/gpu/nvc_macros.h
+++ b/lib/gpu/nvc_macros.h
@@ -20,6 +20,12 @@
 #ifndef NVC_MACROS_H
 #define NVC_MACROS_H
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include
 #include "math_constants.h"
 #define INT_MUL(x,y) (__mul24(x,y))
diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h
index 5b2c164a7f..821ff7b568 100644
--- a/lib/gpu/nvc_memory.h
+++ b/lib/gpu/nvc_memory.h
@@ -96,6 +96,17 @@ class NVC_Host {
     _end=_array+cols;
   }
 
+  // Allocate page-locked memory with fast write/slow read on host
+  inline bool alloc_w(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!=
+        cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   // Allocate page-locked memory with fast read/write on host
   inline void safe_alloc_rw(const size_t cols) {
     _cols=cols;
@@ -104,6 +115,16 @@ class NVC_Host {
     _end=_array+cols;
   }
 
+  // Allocate page-locked memory with fast read/write on host
+  inline bool alloc_rw(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   /// Free any memory associated with device
   inline void clear()
     { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
@@ -163,6 +184,13 @@ class NVC_Host {
                                            cudaMemcpyHostToDevice,stream));
   }
 
+  /// Asynchronous copy to device (numel is not bytes)
+  inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
+                             cudaStream_t &stream) {
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
+  }
+
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
@@ -194,6 +222,16 @@ class NVC_Vec {
     _end=_array+cols;
   }
 
+  // Row vector on device
+  inline bool alloc(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   // Row vector on device (allocate and assign texture and bind)
   inline void safe_alloc(const size_t cols, textureReference *t)
     { safe_alloc(cols); assign_texture(t); bind(); }
@@ -221,11 +259,22 @@ class NVC_Vec {
     { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
                                 cudaMemcpyHostToDevice)); }
 
+  /// Copy from host (n elements)
+  inline void copy_from_host(const numtyp *host_p, const size_t n)
+    { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
+                                cudaMemcpyHostToDevice)); }
+
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
     { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
                                              cudaMemcpyHostToDevice, stream)); }
 
+  /// Asynchronous copy from host (n elements)
+  inline void copy_from_host(const numtyp *host_p, const size_t n,
+                             cudaStream_t &stream)
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
+
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
     { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu
index a5573101d0..7f2f13f3b4 100644
--- a/lib/gpu/pair_gpu_atom.cu
+++ b/lib/gpu/pair_gpu_atom.cu
@@ -28,7 +28,9 @@ int PairGPUAtomT::bytes_per_atom() const {
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::init(const int max_atoms) {
+bool PairGPUAtomT::init(const int max_atoms) {
+  bool success=true;
+
   if (allocated)
     clear();
 
@@ -39,28 +41,48 @@ void PairGPUAtomT::init(const int max_atoms) {
   time_answer.init();
 
   // Device matrices for atom and force data
-  dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
-  ans.safe_alloc(ans_fields(),max_atoms);
+  success=success && dev_x.alloc(max_atoms*sizeof(vec4));
+  success=success && dev_q.alloc(max_atoms*sizeof(vec4));
+  success=success && ans.alloc(ans_fields()*max_atoms);
 
+  // Get a host read/write buffer
+  success=success && host_read.alloc_rw(max_atoms*ans_fields());
   // Get a host write only buffer
-  host_write.safe_alloc_w(max_atoms*atom_fields());
-  // Get a host read/write buffer
-  host_read.safe_alloc_rw(ans.row_size()*ans_fields());
+  success=success && host_write.alloc_w(max_atoms*atom_fields());
 
   allocated=true;
+
+  return success;
 }
 
+template <class numtyp, class acctyp>
+void PairGPUAtomT::resize(const int max_atoms, bool &success) {
+  ans.clear();
+  dev_x.clear();
+  dev_q.clear();
+  host_write.clear();
+  host_read.clear();
+
+  _max_atoms=max_atoms;
+
+  success = success && dev_x.alloc(_max_atoms*sizeof(vec4));
+  success = success && dev_q.alloc(_max_atoms*sizeof(vec4));
+  success = success && ans.alloc(ans_fields()*_max_atoms);
+  success = success && host_read.alloc_rw(_max_atoms*ans_fields());
+  success = success && host_write.alloc_w(_max_atoms*atom_fields());
+}
+
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   if (!allocated)
     return;
   allocated=false;
 
-  dev_x.unbind();
-  ans.clear();
+  ans.clear();
+  dev_x.clear();
+  dev_q.clear();
   host_write.clear();
   host_read.clear();
-  dev_x.clear();
 }
 
@@ -82,88 +104,88 @@ void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
   if (!vflag)
     csize-=6;
 
-  host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
+  host_read.copy_from_device(ans.begin(),_inum*csize,s);
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
                                    const bool vflag_atom, double *eatom,
-                                   double **vatom, double *virial) {
+                                   double **vatom, double *virial,
+                                   double **f, double **tor, const int n) {
   double evdwl=0.0;
-  int gap=ans.row_size()-_inum;
   acctyp *ap=host_read.begin();
 
-  if (_eflag) {
-    if (eflag_atom) {
-      for (int i=0; i<_inum; i++) {
+  for (int i=0; i<_inum; i++) {
+    int ii=ilist[i];
+    if (_eflag) {
+      if (eflag_atom) {
         evdwl+=*ap;
-        eatom[ilist[i]]+=*ap*0.5;
+        eatom[ii]+=*ap*0.5;
         ap++;
-      }
-    } else
-      for (int i=0; i<_inum; i++) {
+      } else {
         evdwl+=*ap;
         ap++;
       }
-    ap+=gap;
-    evdwl*=0.5;
-  }
-  _read_loc=ap;
-  gap=ans.row_size();
-  if (_vflag) {
-    if (vflag_atom) {
-      for (int ii=0; ii<_inum; ii++) {
-        int i=ilist[ii];
-        ap=_read_loc+ii;
+    }
+    if (_vflag) {
+      if (vflag_atom) {
         for (int j=0; j<6; j++) {
-          vatom[i][j]+=*ap*0.5;
+          vatom[ii][j]+=*ap*0.5;
           virial[j]+=*ap;
-          ap+=gap;
+          ap++;
         }
-      }
-    } else {
-      for (int ii=0; ii<_inum; ii++) {
-        ap=_read_loc+ii;
+      } else {
         for (int j=0; j<6; j++) {
           virial[j]+=*ap;
-          ap+=gap;
+          ap++;
         }
       }
     }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-    _read_loc+=gap*6;
+    f[ii][0]+=*ap;
+    ap++;
+    f[ii][1]+=*ap;
+    ap++;
+    f[ii][2]+=*ap;
+    ap++;
+    if (i
[...]
-template <class numtyp, class acctyp>
-void PairGPUAtomT::add_forces(const int *ilist, double **f) {
-  int gap=ans.row_size();
-  for (int ii=0; ii<_inum; ii++) {
-    acctyp *ap=_read_loc+ii;
-    int i=ilist[ii];
-    f[i][0]+=*ap;
-    ap+=gap;
-    f[i][1]+=*ap;
-    ap+=gap;
-    f[i][2]+=*ap;
-  }
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
-  int gap=ans.row_size();
-  _read_loc+=gap*3;
-  for (int ii=0; ii
[...]
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
[...]
+  template <class cpytyp>
+  inline void add_q_data(const cpytyp *host_ptr) {
+    const int end=_nall*4;
+    for (int i=0; i
[...]
-  NVC_Mat<acctyp> ans;
+  // atom coordinates
+  NVC_Vec<numtyp> dev_x;
+  // quaternions
+  NVC_Vec<numtyp> dev_q;
+  // ans_fields()
+  // example: if (eflag and vflag) 1 is energy, 2-7 is virial
+  NVC_Vec<acctyp> ans;
 
   // Buffer for moving floating point data to GPU
   NVC_HostT host_write;
diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu
index 2f167df15f..3e6aa39bc5 100644
--- a/lib/gpu/pair_gpu_cell.cu
+++ b/lib/gpu/pair_gpu_cell.cu
@@ -17,6 +17,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include
 #include "lj_gpu_memory.h"
 #include "pair_gpu_cell.h"
diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu
index 88fdef8dfe..20fba00691 100644
--- a/lib/gpu/pair_gpu_nbor.cu
+++ b/lib/gpu/pair_gpu_nbor.cu
@@ -26,8 +26,9 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
   return (max_nbors+3)*sizeof(int);
 }
 
-void PairGPUNbor::init(const int ij_size, const int max_atoms,
+bool PairGPUNbor::init(const int ij_size, const int max_atoms,
                        const int max_nbors) {
+  bool success=true;
   if (allocated)
     clear();
 
@@ -35,16 +36,29 @@ void PairGPUNbor::init(const int ij_size, const int max_atoms,
   time_nbor.init();
 
   if (_use_packing)
-    dev_nbor.safe_alloc(max_nbors+4,max_atoms);
+    success=success && dev_nbor.alloc((max_nbors+4)*max_atoms);
   else
-    dev_nbor.safe_alloc(3,max_atoms);
+    success=success && dev_nbor.alloc(3*max_atoms);
 
-  ij.safe_alloc(max_nbors*max_atoms);
-  host_ij.safe_alloc_w(ij_size);
+  success=success && ij.alloc(max_nbors*max_atoms);
+  success=success && host_ij.alloc_w(ij_size);
 
   allocated=true;
+
+  return success;
 }
 
+void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) {
+  dev_nbor.clear();
+  ij.clear();
+  if (_use_packing)
+    success=success && dev_nbor.alloc((max_nbor+4)*nlocal);
+  else
+    success=success && dev_nbor.alloc(3*nlocal);
+  success=success && ij.alloc(max_nbor*nlocal);
+  allocated=true;
+}
+
 void PairGPUNbor::clear() {
   if (!allocated)
     return;
@@ -54,7 +68,7 @@ void PairGPUNbor::clear() {
   host_ij.clear();
   dev_nbor.clear();
 }
-
+
 double PairGPUNbor::host_memory_usage() const {
   return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
 }
@@ -65,12 +79,35 @@ void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
 
   dev_nbor.copy_from_host(ilist,inum);
   int acc=0;
-  for (int i=0; i
[...]
+    if (hi>0) {
+      host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s);
+      host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s);
+    }
+  }
 }
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index 6d19972530..c505120785 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -40,8 +40,10 @@ class PairGPUNbor {
   void packing(const bool use_packing) { _use_packing=use_packing; }
 
   /// Called once to allocate memory
-  void init(const int ij_size, const int max_atoms, const int max_nbors);
+  bool init(const int ij_size, const int max_atoms, const int max_nbors);
 
+  void resize(const int nlocal, const int max_nbor, bool &success);
+
   /// Free all memory on host and device
   void clear();
 
@@ -73,7 +75,7 @@ class PairGPUNbor {
   // - 1st row is i
   // - 2nd row is numj (number of neighbors)
   // - 3rd row is starting address in host_ij of neighbors
-  NVC_MatI dev_nbor;
+  NVC_VecI dev_nbor;
 
   // --------------- Timing Stuff
   NVCTimer time_nbor;
diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h
index accc4e6b29..6c26b47952 100644
--- a/lib/gpu/pair_gpu_texture.h
+++ b/lib/gpu/pair_gpu_texture.h
@@ -31,32 +31,6 @@
 #define GB_GPU_DOUBLE
 #endif
 
-// ------------------------------- x ------------------------------------
-
-static texture x_float_tex;
-static texture x_double_tex;
-template <class numtyp> inline textureReference * x_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"x_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * x_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"x_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _x_(const int i, const int j) {
-  return tex2D(x_float_tex,i,j);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _x_<double>(const int i,const int j) {
-  int2 t=tex2D(x_double_tex,i,j);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
 // ------------------------------- form ------------------------------------
 
 static texture form_tex;
diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_tex_tar.cu
index b9a275ee22..426d2ff059 100644
--- a/lib/gpu/pair_tex_tar.cu
+++ b/lib/gpu/pair_tex_tar.cu
@@ -17,6 +17,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include "pair_gpu_atom.cu"
 #include "lj_gpu.cu"
 #include "gb_gpu.cu"
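
NOTE: The recurring kernel change in this patch replaces the old 2D position
texture (_x_: rows 0-2 hold xyz, row 7 the type) with a flat array of vec4
values that packs the atom type code into the .w component, so each thread
issues a single coalesced 16-byte load per atom. Below is a minimal sketch of
a kernel reading that layout; it assumes a single-precision build where vec4
is float4, and the kernel and buffer names are illustrative only, not part of
the library:

  #include <cuda_runtime.h>

  typedef float4 vec4;   // stand-in for the library's vec4 (float build)

  // Reads the packed layout: xyz plus the type code stored in .w.
  __global__ void unpack_demo(const vec4 *x_, float *rsq, int *type,
                              const int inum) {
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    if (i<inum) {
      vec4 ix=x_[i];    // one coalesced load replaces four strided fetches
      type[i]=ix.w;     // implicit float-to-int conversion, as in the kernels
      rsq[i]=ix.x*ix.x+ix.y*ix.y+ix.z*ix.z;
    }
  }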
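NOTE: On the host side, the patch drops the fixed PERCENT_GPU_MEMORY /
MAX_ATOMS cap (get_max_atoms) in favor of demand-driven reallocation:
resize_atom() and resize_local() grow buffers only when nall, nlocal, or the
maximum neighbor count exceeds the current capacity, overshoot by 10% to
amortize future growth, and report failure through a success flag instead of
silently limiting the atom count. A compact sketch of that policy; grow_device
and its parameters are hypothetical names, not the library's API:

  #include <cuda_runtime.h>

  // Reallocate only on overflow, with 10% headroom (cf. resize_atom()).
  template <class T>
  bool grow_device(T *&ptr, int &capacity, const int needed) {
    if (needed<=capacity)
      return true;                  // current buffer still fits, keep it
    capacity=static_cast<int>(static_cast<double>(needed)*1.10);
    if (ptr!=NULL)
      cudaFree(ptr);                // old contents are rebuilt by the caller
    return cudaMalloc((void **)&ptr,capacity*sizeof(T))==cudaSuccess;
  }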
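NOTE: nvc_memory.h gains two fallible page-locked allocators: alloc_rw() wraps
cudaMallocHost for buffers the host reads back (host_read), while alloc_w()
wraps cudaHostAlloc with cudaHostAllocWriteCombined for upload-only buffers
(host_write). Write-combined memory is fast for the host to fill and to stream
over PCIe, but uncached for host reads. A short sketch of the distinction, with
hypothetical buffer names:

  #include <cuda_runtime.h>

  bool alloc_host_buffers(float *&upload, float *&readback, const size_t n) {
    // write-combined pinned memory: fill it, never read it back (cf. alloc_w)
    if (cudaHostAlloc((void **)&upload,n*sizeof(float),
                      cudaHostAllocWriteCombined)!=cudaSuccess)
      return false;
    // ordinary pinned memory: cached on the host, suitable for results that
    // are read after a device-to-host copy (cf. alloc_rw)
    if (cudaMallocHost((void **)&readback,n*sizeof(float))!=cudaSuccess)
      return false;
    return true;
  }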