forked from lijiext/lammps
362 lines
12 KiB
C++
362 lines
12 KiB
C++
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
|
------------------------------------------------------------------------- */
|
|
|
|
#ifdef USE_OPENCL
|
|
#include "gb_gpu_cl.h"
|
|
#include "gb_gpu_nbor_cl.h"
|
|
#else
|
|
#include "gb_gpu_ptx.h"
|
|
#endif
|
|
|
|
#include "gb_gpu_memory.h"
|
|
#include <cassert>
|
|
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
|
|
|
|
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
|
|
|
template <class numtyp, class acctyp>
|
|
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
|
|
_max_bytes(0.0) {
|
|
device=&pair_gpu_device;
|
|
ans=new PairGPUAns<numtyp,acctyp>();
|
|
nbor=new PairGPUNbor;
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
GB_GPU_MemoryT::~GB_GPU_Memory() {
|
|
clear();
|
|
delete ans;
|
|
delete nbor;
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
|
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
|
nbor->bytes_per_atom(max_nbors);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
|
|
const double upsilon, const double mu,
|
|
double **host_shape, double **host_well,
|
|
double **host_cutsq, double **host_sigma,
|
|
double **host_epsilon, double *host_lshape,
|
|
int **h_form, double **host_lj1, double **host_lj2,
|
|
double **host_lj3, double **host_lj4,
|
|
double **host_offset, const double *host_special_lj,
|
|
const int nlocal, const int nall,
|
|
const int max_nbors, const double cell_size,
|
|
const double gpu_split, FILE *_screen) {
|
|
nbor_time_avail=false;
|
|
screen=_screen;
|
|
|
|
bool gpu_nbor=false;
|
|
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
|
|
gpu_nbor=true;
|
|
|
|
int _gpu_host=0;
|
|
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
|
if (host_nlocal>0)
|
|
_gpu_host=1;
|
|
|
|
_threads_per_atom=device->threads_per_atom();
|
|
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
|
|
_gpu_host,max_nbors,cell_size,true);
|
|
if (success!=0)
|
|
return success;
|
|
|
|
ucl_device=device->gpu;
|
|
atom=&device->atom;
|
|
|
|
_block_size=device->pair_block_size();
|
|
compile_kernels(*ucl_device);
|
|
|
|
// Initialize host-device load balancer
|
|
hd_balancer.init(device,gpu_nbor,gpu_split);
|
|
|
|
// Initialize timers for the selected GPU
|
|
time_pair.init(*ucl_device);
|
|
time_pair.zero();
|
|
|
|
// If atom type constants fit in shared memory use fast kernel
|
|
int lj_types=ntypes;
|
|
shared_types=false;
|
|
int max_shared_types=device->max_shared_types();
|
|
if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
|
|
lj_types=max_shared_types;
|
|
shared_types=true;
|
|
}
|
|
_lj_types=lj_types;
|
|
|
|
// Allocate a host write buffer for copying type data
|
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
|
|
UCL_WRITE_OPTIMIZED);
|
|
|
|
for (int i=0; i<lj_types*lj_types; i++)
|
|
host_write[i]=0.0;
|
|
|
|
sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
|
|
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
|
|
host_sigma,host_epsilon);
|
|
|
|
cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
|
|
this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
|
|
host_cutsq,h_form);
|
|
|
|
lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
|
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
|
host_cutsq,h_form);
|
|
|
|
lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
|
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
|
host_offset);
|
|
|
|
dev_error.alloc(1,*ucl_device);
|
|
dev_error.zero();
|
|
|
|
_allocated=true;
|
|
|
|
host_form=h_form;
|
|
|
|
// Initialize timers for the selected GPU
|
|
time_kernel.init(*ucl_device);
|
|
time_gayberne.init(*ucl_device);
|
|
time_kernel2.init(*ucl_device);
|
|
time_gayberne2.init(*ucl_device);
|
|
time_kernel.zero();
|
|
time_gayberne.zero();
|
|
time_kernel2.zero();
|
|
time_gayberne2.zero();
|
|
|
|
// Allocate, cast and asynchronous memcpy of constant data
|
|
// Copy data for bonded interactions
|
|
gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
|
|
host_write[0]=static_cast<numtyp>(gamma);
|
|
host_write[1]=static_cast<numtyp>(upsilon);
|
|
host_write[2]=static_cast<numtyp>(mu);
|
|
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
|
|
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
|
|
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
|
|
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
|
|
ucl_copy(gamma_upsilon_mu,host_write,7,false);
|
|
|
|
lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
|
|
UCL_H_Vec<double> d_view;
|
|
d_view.view(host_lshape,lshape.numel(),*ucl_device);
|
|
ucl_copy(lshape,d_view,false);
|
|
|
|
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
|
|
// - cast if necessary
|
|
shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
|
|
for (int i=0; i<ntypes; i++) {
|
|
host_write[i*4]=host_shape[i][0];
|
|
host_write[i*4+1]=host_shape[i][1];
|
|
host_write[i*4+2]=host_shape[i][2];
|
|
}
|
|
UCL_H_Vec<numtyp4> view4;
|
|
view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
|
|
ucl_copy(shape,view4,false);
|
|
|
|
well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
|
|
for (int i=0; i<ntypes; i++) {
|
|
host_write[i*4]=host_well[i][0];
|
|
host_write[i*4+1]=host_well[i][1];
|
|
host_write[i*4+2]=host_well[i][2];
|
|
}
|
|
view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
|
|
ucl_copy(well,view4,false);
|
|
|
|
// See if we want fast GB-sphere or sphere-sphere calculations
|
|
multiple_forms=false;
|
|
for (int i=1; i<ntypes; i++)
|
|
for (int j=i; j<ntypes; j++)
|
|
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
|
|
multiple_forms=true;
|
|
if (multiple_forms && host_nlocal>0) {
|
|
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
|
|
exit(1);
|
|
}
|
|
|
|
if (multiple_forms)
|
|
ans->dev_ans.zero();
|
|
|
|
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
|
|
|
// Memory for ilist ordered by particle type
|
|
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
|
|
return 0;
|
|
else return -3;
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void GB_GPU_MemoryT::estimate_gpu_overhead() {
|
|
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void GB_GPU_MemoryT::clear() {
|
|
if (!_allocated)
|
|
return;
|
|
|
|
UCL_H_Vec<int> err_flag(1,*ucl_device);
|
|
ucl_copy(err_flag,dev_error,false);
|
|
if (err_flag[0] == 2)
|
|
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
|
|
err_flag.clear();
|
|
|
|
_allocated=false;
|
|
|
|
// Output any timing information
|
|
acc_timers();
|
|
double single[9], times[9];
|
|
|
|
single[0]=atom->transfer_time()+ans->transfer_time();
|
|
single[1]=nbor->time_nbor.total_seconds();
|
|
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
|
|
nbor->time_kernel.total_seconds();
|
|
single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
|
|
if (multiple_forms)
|
|
single[4]=time_pair.total_seconds();
|
|
else
|
|
single[4]=0;
|
|
single[5]=atom->cast_time()+ans->cast_time();
|
|
single[6]=_gpu_overhead;
|
|
single[7]=_driver_overhead;
|
|
single[8]=ans->cpu_idle_time();
|
|
|
|
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
|
|
double avg_split=hd_balancer.all_avg_split();
|
|
|
|
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
|
|
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
|
|
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
|
|
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
|
|
double mpi_max_bytes;
|
|
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
|
|
device->replica());
|
|
double max_mb=mpi_max_bytes/(1024*1024);
|
|
|
|
if (device->replica_me()==0)
|
|
if (screen && times[3]>0.0) {
|
|
int replica_size=device->replica_size();
|
|
|
|
fprintf(screen,"\n\n-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n");
|
|
fprintf(screen," GPU Time Info (average): ");
|
|
fprintf(screen,"\n-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n");
|
|
|
|
if (device->procs_per_gpu()==1) {
|
|
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
|
|
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
|
|
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
|
|
if (nbor->gpu_nbor())
|
|
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
|
|
else
|
|
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
|
|
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
|
|
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
|
|
}
|
|
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
|
|
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
|
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
|
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
|
|
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
|
|
fprintf(screen,"-------------------------------------");
|
|
fprintf(screen,"--------------------------------\n\n");
|
|
|
|
|
|
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
|
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
|
|
|
|
|
}
|
|
_max_bytes=0.0;
|
|
|
|
dev_error.clear();
|
|
lj1.clear();
|
|
lj3.clear();
|
|
sigma_epsilon.clear();
|
|
cut_form.clear();
|
|
|
|
shape.clear();
|
|
well.clear();
|
|
lshape.clear();
|
|
gamma_upsilon_mu.clear();
|
|
host_olist.clear();
|
|
|
|
time_kernel.clear();
|
|
time_gayberne.clear();
|
|
time_kernel2.clear();
|
|
time_gayberne2.clear();
|
|
time_pair.clear();
|
|
hd_balancer.clear();
|
|
|
|
if (_compiled) {
|
|
k_gb_nbor_fast.clear();
|
|
k_gb_nbor.clear();
|
|
k_gayberne.clear();
|
|
k_sphere_gb.clear();
|
|
k_lj_fast.clear();
|
|
k_lj.clear();
|
|
delete pair_program;
|
|
delete gb_program;
|
|
delete gb_lj_program;
|
|
_compiled=false;
|
|
}
|
|
|
|
nbor->clear();
|
|
ans->clear();
|
|
device->clear();
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
double GB_GPU_MemoryT::host_memory_usage() const {
|
|
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
|
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
|
|
nbor->max_atoms()*sizeof(int);
|
|
}
|
|
|
|
template <class numtyp, class acctyp>
|
|
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
|
|
if (_compiled)
|
|
return;
|
|
|
|
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
|
|
std::string(OCL_PRECISION_COMPILE);
|
|
|
|
pair_program=new UCL_Program(dev);
|
|
pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
|
|
k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
|
|
k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");
|
|
|
|
gb_program=new UCL_Program(dev);
|
|
gb_program->load_string(gb_gpu_kernel,flags.c_str());
|
|
k_gayberne.set_function(*gb_program,"kernel_gayberne");
|
|
|
|
gb_lj_program=new UCL_Program(dev);
|
|
gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
|
|
k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
|
|
k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
|
|
k_lj.set_function(*gb_lj_program,"kernel_lj");
|
|
|
|
_compiled=true;
|
|
}
|
|
|
|
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
|