From 7510ec796096b1d1e70d48b2f04b53442c5943f2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Wed, 29 Oct 2014 15:47:24 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12655 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/geryon/VERSION.txt | 2 +- lib/gpu/geryon/nvd_device.h | 6 ++ lib/gpu/geryon/ocl_device.h | 13 ++- lib/gpu/geryon/ocl_memory.h | 5 +- lib/gpu/lal_answer.cpp | 16 ++- lib/gpu/lal_aux_fun1.h | 71 +++++++------- lib/gpu/lal_base_ellipsoid.cpp | 8 +- lib/gpu/lal_beck.cu | 16 +-- lib/gpu/lal_beck_ext.cpp | 6 +- lib/gpu/lal_born.cu | 16 +-- lib/gpu/lal_born_coul_long.cu | 16 +-- lib/gpu/lal_born_coul_long_ext.cpp | 6 +- lib/gpu/lal_born_coul_wolf.cu | 44 +++++---- lib/gpu/lal_born_coul_wolf_ext.cpp | 6 +- lib/gpu/lal_born_ext.cpp | 6 +- lib/gpu/lal_buck.cu | 16 +-- lib/gpu/lal_buck_coul.cu | 16 +-- lib/gpu/lal_buck_coul_ext.cpp | 6 +- lib/gpu/lal_buck_coul_long.cu | 16 +-- lib/gpu/lal_buck_coul_long_ext.cpp | 6 +- lib/gpu/lal_buck_ext.cpp | 6 +- lib/gpu/lal_cg_cmm.cu | 16 +-- lib/gpu/lal_cg_cmm_ext.cpp | 6 +- lib/gpu/lal_cg_cmm_long.cu | 16 +-- lib/gpu/lal_cg_cmm_long_ext.cpp | 6 +- lib/gpu/lal_charmm_long.cpp | 11 ++- lib/gpu/lal_charmm_long.cu | 28 ++---- lib/gpu/lal_charmm_long_ext.cpp | 6 +- lib/gpu/lal_colloid.cu | 16 +-- lib/gpu/lal_colloid_ext.cpp | 6 +- lib/gpu/lal_coul.cu | 23 +++-- lib/gpu/lal_coul_debye.cu | 52 +++++----- lib/gpu/lal_coul_dsf.cu | 16 +-- lib/gpu/lal_coul_dsf_ext.cpp | 6 +- lib/gpu/lal_coul_long.cu | 54 +++++----- lib/gpu/lal_coul_long_ext.cpp | 6 +- lib/gpu/lal_device.cpp | 23 +++-- lib/gpu/lal_device.h | 10 +- lib/gpu/lal_dipole_lj.cu | 44 ++++----- lib/gpu/lal_dipole_lj_ext.cpp | 6 +- lib/gpu/lal_dipole_lj_sf.cu | 44 ++++----- lib/gpu/lal_dipole_lj_sf_ext.cpp | 6 +- lib/gpu/lal_dpd.cu | 18 ++-- lib/gpu/lal_eam.cu | 51 +++++----- lib/gpu/lal_eam_ext.cpp | 6 +- lib/gpu/lal_ellipsoid_extra.h | 15 ++- lib/gpu/lal_ellipsoid_nbor.cu | 34 +++---- lib/gpu/lal_gauss.cu | 16 +-- lib/gpu/lal_gauss_ext.cpp | 6 +- lib/gpu/lal_gayberne.cu | 4 +- lib/gpu/lal_gayberne_ext.cpp | 6 +- lib/gpu/lal_gayberne_lj.cu | 20 ++-- lib/gpu/lal_lj.cu | 26 ++--- lib/gpu/lal_lj96.cu | 16 +-- lib/gpu/lal_lj96_ext.cpp | 6 +- lib/gpu/lal_lj_class2_long.cu | 16 +-- lib/gpu/lal_lj_class2_long_ext.cpp | 6 +- lib/gpu/lal_lj_coul.cu | 16 +-- lib/gpu/lal_lj_coul_debye.cu | 16 +-- lib/gpu/lal_lj_coul_debye_ext.cpp | 6 +- lib/gpu/lal_lj_coul_ext.cpp | 6 +- lib/gpu/lal_lj_coul_long.cpp | 19 +++- lib/gpu/lal_lj_coul_long.cu | 16 +-- lib/gpu/lal_lj_coul_long.h | 5 + lib/gpu/lal_lj_coul_long_ext.cpp | 29 +++++- lib/gpu/lal_lj_coul_msm.cu | 16 +-- lib/gpu/lal_lj_coul_msm_ext.cpp | 6 +- lib/gpu/lal_lj_dsf.cu | 16 +-- lib/gpu/lal_lj_dsf_ext.cpp | 6 +- lib/gpu/lal_lj_expand.cu | 16 +-- lib/gpu/lal_lj_expand_ext.cpp | 6 +- lib/gpu/lal_lj_ext.cpp | 6 +- lib/gpu/lal_lj_gromacs.cu | 18 ++-- lib/gpu/lal_mie.cu | 16 +-- lib/gpu/lal_mie_ext.cpp | 6 +- lib/gpu/lal_morse.cu | 16 +-- lib/gpu/lal_morse_ext.cpp | 6 +- lib/gpu/lal_neighbor.cpp | 7 +- lib/gpu/lal_neighbor.h | 1 + lib/gpu/lal_neighbor_cpu.cu | 10 +- lib/gpu/lal_pppm.cpp | 2 +- lib/gpu/lal_pppm_ext.cpp | 6 +- lib/gpu/lal_preprocessor.h | 50 ++++++++++ lib/gpu/lal_re_squared.cu | 4 +- lib/gpu/lal_re_squared_ext.cpp | 6 +- lib/gpu/lal_re_squared_lj.cu | 24 ++--- lib/gpu/lal_soft.cu | 16 +-- lib/gpu/lal_soft_ext.cpp | 6 +- lib/gpu/lal_sw.cu | 152 ++++++++++++++++------------- lib/gpu/lal_table.cu | 64 ++++++------ lib/gpu/lal_table_ext.cpp | 6 +- lib/gpu/lal_yukawa.cu | 16 +-- lib/gpu/lal_yukawa_colloid.cu | 16 +-- lib/gpu/lal_yukawa_colloid_ext.cpp | 6 +- lib/gpu/lal_yukawa_ext.cpp | 6 +- 95 files changed, 850 insertions(+), 745 deletions(-) diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 733136e842..39b132ac54 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1 +1 @@ -Geryon Version 13.209 +Geryon Version 13.234 diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 5fffe77c82..12a18ae873 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -159,6 +159,12 @@ class UCL_Device { /// Returns true if double precision is support for the device inline bool double_precision(const int i) {return arch(i)>=1.3;} + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].multiProcessorCount; } + /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } /// Get the number of cores diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 79fa53d552..8dadcf2efd 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -185,13 +185,12 @@ class UCL_Device { inline bool double_precision(const int i) {return _properties[i].double_precision;} - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { if (device_type(i)==UCL_CPU) return _properties[i].compute_units; - else return _properties[i].compute_units*8; } - + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].compute_units; } + /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 09089a52ce..7aed0a1a8c 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -127,7 +127,8 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, - mat.host_ptr(), &error_flag); + *mat.host_ptr(), &error_flag); + CL_CHECK_ERR(error_flag); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; @@ -174,7 +175,7 @@ template inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) { cl_int error_flag; mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR, - n,mat.host_ptr(),&error_flag); + n,*mat.host_ptr(),&error_flag); CL_CHECK_ERR(error_flag); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index ddf893e4ed..c1f4999207 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -221,7 +221,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, return energy_virial(eatom,vatom,virial); double evdwl=0.0; - int vstart=0, iend=_inum; + int ii, vstart=0, iend=_inum; if (_eflag) { iend=_inum*2; for (int i=0; i<_inum; i++) @@ -235,10 +235,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=_inum; i0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -123,17 +122,17 @@ } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -158,15 +157,15 @@ } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -190,17 +189,17 @@ } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index dd83bfa9a4..4200c02e1c 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -202,6 +202,7 @@ void BaseEllipsoidT::output_times() { MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (device->replica_me()==0) if (screen && times[5]>0.0) { @@ -209,11 +210,11 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1) { + if (device->procs_per_gpu()==1 && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); @@ -226,7 +227,8 @@ void BaseEllipsoidT::output_times() { } if (nbor->gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); + if (times[6]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 479587047c..7ccefd8859 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -52,19 +52,19 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii0) { - if (rsq < cut_coulsq) - e_coul += v_sh*factor_coul; + if (rsq < cut_coulsq) { + numtyp e=v_sh; + if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; + } if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; @@ -201,11 +205,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) { - if (rsq < cut_coulsq) - e_coul += v_sh*factor_coul; + if (rsq < cut_coulsq) { + numtyp e=v_sh; + if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; + } if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index 5c9e2c02bf..b56c526119 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -52,7 +52,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } @@ -72,9 +72,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, for (int i=0; i_block_bio_size>=64 && mix_arithmetic) + int max_bio_shared_types=this->device->max_bio_shared_types(); + if (this->_block_bio_size>=64 && mix_arithmetic && + lj_types<=max_bio_shared_types) shared_types=true; _lj_types=lj_types; // Allocate a host write buffer for data initialization int h_size=lj_types*lj_types; - int max_bio_shared_types=this->device->max_bio_shared_types(); if (h_size host_write(h_size*32,*(this->ucl_device), @@ -84,8 +85,10 @@ int CHARMMLongT::init(const int ntypes, this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, host_lj3,host_lj4); - ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); + if (shared_types) { + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); + } sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index 3e473a031a..dde50da300 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -32,7 +32,7 @@ texture q_tex; __kernel void k_charmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, const int lj_types, - const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_lj, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -47,16 +47,6 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - __local numtyp sp_lj[8]; - sp_lj[0]=sp_lj_in[0]; - sp_lj[1]=sp_lj_in[1]; - sp_lj[2]=sp_lj_in[2]; - sp_lj[3]=sp_lj_in[3]; - sp_lj[4]=sp_lj_in[4]; - sp_lj[5]=sp_lj_in[5]; - sp_lj[6]=sp_lj_in[6]; - sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -66,18 +56,18 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii q_tex; #endif __kernel void k_coul_debye(const __global numtyp4 *restrict x_, - const __global numtyp *restrict scale, - const int lj_types, - const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, - const numtyp qqrd2e, const numtyp kappa, - const int t_per_atom) { + const __global numtyp *restrict scale, + const int lj_types, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const numtyp kappa, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -61,20 +61,20 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii q_tex; #endif __kernel void k_coul_long(const __global numtyp4 *restrict x_, - const __global numtyp *restrict scale, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, const int lj_types, const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, @@ -153,44 +154,41 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii0) { - e_coul += prefactor*(_erfc-factor_coul); + e_coul += prefactor*(_erfc-factor_coul); } if (vflag>0) { virial[0] += delx*delx*force; @@ -217,7 +215,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp *restrict scale_in, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, const __global int *dev_packed, @@ -231,14 +230,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; if (tid<4) sp_cl[tid]=sp_cl_in[tid]; - if (tid0) { - e_coul += prefactor*(_erfc-factor_coul); + e_coul += prefactor*(_erfc-factor_coul); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 49d593d9a0..5552dc2437 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -48,7 +48,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } @@ -65,9 +65,9 @@ int cl_gpu_init(const int ntypes, double **host_scale, for (int i=0; iname(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; else - sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { if (sizeof(ACC_PRECISION)==4) @@ -520,7 +527,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," Device Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -582,7 +589,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," Device Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); @@ -672,7 +679,7 @@ int DeviceT::compile_kernels() { _threads_per_charge=gpu_lib_data[13]; _pppm_max_spline=gpu_lib_data[4]; _pppm_block=gpu_lib_data[5]; - _block_pair=gpu_lib_data[6]; + if (_block_pair == -1) _block_pair=gpu_lib_data[6]; _max_shared_types=gpu_lib_data[7]; _block_cell_2d=gpu_lib_data[8]; _block_cell_id=gpu_lib_data[9]; @@ -714,10 +721,10 @@ int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, const int t_per_atom, const double cell_size, - char *opencl_vendor) { + char *opencl_vendor, const int block_pair) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads,t_per_atom, - cell_size,opencl_vendor); + cell_size,opencl_vendor,block_pair); } void lmp_clear_device() { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 9a767e74b3..77321f5462 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -51,7 +51,7 @@ class Device { const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, const int t_per_atom, const double cell_size, - char *vendor_string); + char *vendor_string, const int block_pair); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -96,12 +96,12 @@ class Device { const int first_gpu, const int last_gpu); /// Perform charge assignment asynchronously for PPPM - void set_single_precompute(PPPM *pppm); + void set_single_precompute(PPPM *pppm); /// Perform charge assignment asynchronously for PPPM - void set_double_precompute(PPPM *pppm); + void set_double_precompute(PPPM *pppm); /// Esimate the overhead from GPU calls from multiple procs /** \param kernel_calls Number of kernel calls/timestep for timing estimated diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index e5ca559dab..b6483d1ef8 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -73,17 +73,17 @@ texture mu_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -113,17 +113,17 @@ texture mu_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -173,19 +173,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii mu_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -114,17 +114,17 @@ texture mu_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv=energy*(acctyp)0.5; \ - engv+=inum; \ - *engv=e_coul*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=inum; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -174,19 +174,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii z2r_sp2_tex; } \ } \ if (offset==0) { \ + int ei=ii; \ if (eflag>0) { \ - engv[ii]+=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - engv[ii]=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -171,15 +172,15 @@ texture z2r_sp2_tex; } \ } \ if (offset==0) { \ - engv+=ii; \ + int ei=ii; \ if (eflag>0) { \ - *engv+=energy*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]*(acctyp)0.5; \ - engv+=inum; \ + engv[ei]=virial[i]*(acctyp)0.5; \ + ei+=inum; \ } \ } \ ans[ii]=f; \ @@ -209,17 +210,17 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, acctyp energy = (acctyp)0; if (ii pos_tex, quat_tex; #endif #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, list_end, nbor) \ - nbor=nbor_mem+ii; \ - i=*nbor; \ - nbor+=nbor_stride; \ - numj=*nbor; \ - nbor+=nbor_stride; \ - list_end=nbor+fast_mul(nbor_stride,numj); \ - nbor+=fast_mul(offset,nbor_stride); \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride; \ + nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ + nbor_begin+=fast_mul(offset,nbor_stride); \ stride=fast_mul(t_per_atom,nbor_stride); #if (ARCH < 300) diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 47ee173a4b..30d864aecc 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -41,20 +41,19 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, int ii=GLOBAL_ID_X+start; if (iiucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq, host_cut_ljsq); + host_cutsq, host_cut_ljsq); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, @@ -102,6 +102,23 @@ int LJCoulLongT::init(const int ntypes, return 0; } +template +void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_cut_ljsq) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); +} + template void LJCoulLongT::clear() { if (!_allocated) diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index 7d838b7c17..e0aa2e8a58 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -65,18 +65,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii { const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_cut_ljsq); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index b769446d43..dc93365f22 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -51,7 +51,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } @@ -69,9 +69,9 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, for (int i=0; iworld_me(); + int gpu_rank=LJCLMF.device->gpu_rank(); + int procs_per_gpu=LJCLMF.device->procs_per_gpu(); + + if (world_me==0) + LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, host_cut_ljsq); + LJCLMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + void ljcl_gpu_clear() { LJCLMF.clear(); } diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index 5be8fe98f0..0c7c3cdace 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -118,11 +118,11 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; if (ii0) { diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index d54aa439b0..7653291bbb 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -227,6 +227,7 @@ class Neighbor { int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; bool _gpu_host, _alloc_packed; double _cutoff, _cell_size, _bin_time; + enum UCL_MEMOPT _packed_permissions; double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index 1a1d392032..384b88d9de 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -25,17 +25,17 @@ __kernel void kernel_unpack(__global int *dev_nbor, int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - const __global int *nbor_k=nbor_j-offset_j+offset_k; + int nbor_k=nbor_j-offset_j+offset_k; if (nbor_k<=nbor_j) nbor_k+=n_stride; - for ( ; nbor_k sw3_ijparam.y) continue; - int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype]; - numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex); - numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex); - sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_jiparam.x; + numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; + sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; + sw_cut_ij=sw3_ijparam.x; - const __global int *nbor_k=dev_nbor+j+nbor_pitch; - int numk=*nbor_k; + int nbor_k=j+nbor_pitch; + int numk=dev_nbor[nbor_k]; if (dev_nbor==dev_packed) { nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { nbor_k+=nbor_pitch; - nbor_k=dev_packed+*nbor_k; + nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; nbor_k+=offset_k; } for ( ; nbor_k sw3_ijparam.y) continue; - int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype]; - numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex); - numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex); - sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_jiparam.x; - - const __global int *nbor_k=dev_nbor+j+nbor_pitch; - int numk=*nbor_k; + numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); + sw_sigma=sw1_ijparam.y; + sw_gamma=sw1_ijparam.w; + sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; + sw_cut_ij=sw3_ijparam.x; + + int nbor_k=j+nbor_pitch; + int numk=dev_nbor[nbor_k]; if (dev_nbor==dev_packed) { nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { nbor_k+=nbor_pitch; - nbor_k=dev_packed+*nbor_k; + nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; nbor_k+=offset_k; } for ( ; nbor_k