lammps/lib/gpu/lal_pppm.cpp

412 lines
13 KiB
C++
Raw Normal View History

/***************************************************************************
pppm.cpp
-------------------
W. Michael Brown (ORNL)
Class for PPPM acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "pppm_cl.h"
#else
#include "pppm_f_ptx.h"
#include "pppm_d_ptx.h"
#endif
#include "lal_pppm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::PPPM() : _allocated(false), _compiled(false),
_max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::~PPPM() {
clear(0.0);
delete ans;
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::bytes_per_atom() const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, grdtyp **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, const bool split, int &flag) {
_max_bytes=10;
screen=_screen;
_kspace_split=split;
bool success=true;
flag=device->init(*ans,nlocal,nall);
if (flag!=0)
return 0;
if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
flag=-5;
return 0;
}
if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
flag=-4;
return 0;
}
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pppm_block();
_pencil_size=device->num_mem_threads();
_block_pencils=_block_size/_pencil_size;
compile_kernels(*ucl_device);
// Initialize timers for the selected GPU
time_in.init(*ucl_device);
time_in.zero();
time_out.init(*ucl_device);
time_out.zero();
time_map.init(*ucl_device);
time_map.zero();
time_rho.init(*ucl_device);
time_rho.zero();
time_interp.init(*ucl_device);
time_interp.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_allocated=true;
_max_bytes=0;
_max_an_bytes=ans->gpu_bytes();
_order=order;
_order_m_1=order-1;
_order2=_order_m_1*_order;
_nlower=-(_order-1)/2;
_nupper=order/2;
_nxlo_out=nxlo_out;
_nylo_out=nylo_out;
_nzlo_out=nzlo_out;
_nxhi_out=nxhi_out;
_nyhi_out=nyhi_out;
_nzhi_out=nzhi_out;
_slab_volfactor=slab_volfactor;
_nx_pppm=nx_pppm;
_ny_pppm=ny_pppm;
_nz_pppm=nz_pppm;
_max_brick_atoms=10;
// Get rho_coeff on device
int n2lo=(1-order)/2;
int numel=order*( order/2 - n2lo + 1 );
success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
UCL_SUCCESS);
UCL_H_Vec<grdtyp> view;
view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
ucl_copy(d_rho_coeff,view,true);
_max_bytes+=d_rho_coeff.row_bytes();
// Allocate storage for grid
_npts_x=nxhi_out-nxlo_out+1;
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
*vd_brick=h_vd_brick.begin();
_max_bytes+=d_brick.row_bytes();
// Allocate vector with count of atoms assigned to each grid point
_nlocal_x=_npts_x+_nlower-_nupper;
_nlocal_y=_npts_y+_nlower-_nupper;
_nlocal_z=_npts_z+_nlower-_nupper;
_nlocal_yx=_nlocal_x*_nlocal_y;
_atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
UCL_SUCCESS);
_max_bytes+=d_brick_counts.row_bytes();
// Allocate storage for atoms assigned to each grid point
success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
*ucl_device)==UCL_SUCCESS);
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;
}
d_error_flag.zero();
_max_bytes+=1;
_cpu_idle_time=0.0;
return h_brick.begin();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::clear(const double cpu_time) {
if (!_allocated)
return;
_allocated=false;
_precompute_done=false;
d_brick.clear();
h_brick.clear();
h_vd_brick.clear();
d_brick_counts.clear();
h_error_flag.clear();
d_error_flag.clear();
d_brick_atoms.clear();
acc_timers();
device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
*ans,_max_bytes+_max_an_bytes,cpu_time,
_cpu_idle_time,screen);
if (_compiled) {
k_particle_map.clear();
k_make_rho.clear();
k_interp.clear();
delete pppm_program;
_compiled=false;
}
time_in.clear();
time_out.clear();
time_map.clear();
time_rho.clear();
time_interp.clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Charge assignment that can be performed asynchronously
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
acc_timers();
if (nlocal==0) {
zero_timers();
return;
}
ans->inum(nlocal);
if (ago==0) {
resize_atom(nlocal,nall,success);
resize_local(nlocal,success);
if (!success)
return;
double bytes=ans->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->add_x_data(host_x,host_type);
atom->add_q_data();
time_map.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
// Boxlo adjusted to be upper left brick and shift for even spline order
double shift=0.0;
if (_order % 2)
shift=0.5;
_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
_delxinv=delxinv;
_delyinv=delyinv;
_delzinv=delzinv;
double delvolinv = delxinv*delyinv*delzinv;
grdtyp f_delvolinv = delvolinv;
device->zero(d_brick_counts,d_brick_counts.numel());
k_particle_map.set_size(GX,BX);
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
time_map.stop();
time_rho.start();
BX=block_size();
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
_block_pencils));
k_make_rho.set_size(GX,BX);
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
&_nlocal_z, &_order_m_1, &_order, &_order2);
time_rho.stop();
time_out.start();
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
ucl_copy(h_error_flag,d_error_flag,true);
time_out.stop();
_precompute_done=true;
}
// ---------------------------------------------------------------------------
// Charge spreading stuff
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::spread(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
if (_precompute_done==false) {
atom->acc_timers();
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
delyinv,delzinv);
}
device->stop_host_timer();
if (!success || nlocal==0)
return 0;
double t=MPI_Wtime();
time_out.sync_stop();
_cpu_idle_time+=MPI_Wtime()-t;
_precompute_done=false;
if (h_error_flag[0]==2) {
// Not enough storage for atoms on the brick
_max_brick_atoms*=2;
d_error_flag.zero();
d_brick_atoms.clear();
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
_max_bytes+=d_brick_atoms.row_bytes();
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
return h_error_flag[0];
}
// ---------------------------------------------------------------------------
// Charge spreading stuff
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::interp(const grdtyp qqrd2e_scale) {
time_in.start();
ucl_copy(d_brick,h_vd_brick,true);
time_in.stop();
time_interp.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
k_interp.set_size(GX,BX);
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
time_interp.stop();
ans->copy_answers(false,false,false,false);
if (_kspace_split==false)
device->add_ans_object(ans);
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
double PPPMT::host_memory_usage() const {
return device->atom.host_memory_usage()+
sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>);
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
#ifdef USE_OPENCL
flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
ucl_template_name<grdtyp>()+"4";
#endif
pppm_program=new UCL_Program(dev);
#ifdef USE_OPENCL
pppm_program->load_string(pppm,flags.c_str());
#else
if (sizeof(grdtyp)==sizeof(float))
pppm_program->load_string(pppm_f,flags.c_str());
else
pppm_program->load_string(pppm_d,flags.c_str());
#endif
k_particle_map.set_function(*pppm_program,"particle_map");
k_make_rho.set_function(*pppm_program,"make_rho");
k_interp.set_function(*pppm_program,"interp");
pos_tex.get_texture(*pppm_program,"pos_tex");
q_tex.get_texture(*pppm_program,"q_tex");
_compiled=true;
}
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;