forked from lijiext/lammps
235 lines
8.7 KiB
Plaintext
235 lines
8.7 KiB
Plaintext
|
/***************************************************************************
|
||
|
lj_gpu.cu
|
||
|
-------------------
|
||
|
W. Michael Brown
|
||
|
|
||
|
Lennard-Jones potential GPU calcultation
|
||
|
|
||
|
__________________________________________________________________________
|
||
|
This file is part of the LAMMPS GPU Library
|
||
|
__________________________________________________________________________
|
||
|
|
||
|
begin : Tue Aug 4 2009
|
||
|
copyright : (C) 2009 by W. Michael Brown
|
||
|
email : wmbrown@sandia.gov
|
||
|
***************************************************************************/
|
||
|
|
||
|
/* -----------------------------------------------------------------------
|
||
|
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
||
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||
|
certain rights in this software. This software is distributed under
|
||
|
the GNU General Public License.
|
||
|
----------------------------------------------------------------------- */
|
||
|
|
||
|
#include <iostream>
|
||
|
#include <cassert>
|
||
|
#include "nvc_macros.h"
|
||
|
#include "nvc_timer.h"
|
||
|
#include "nvc_device.h"
|
||
|
#include "lj_gpu_memory.h"
|
||
|
#include "lj_gpu_kernel.h"
|
||
|
|
||
|
using namespace std;
|
||
|
|
||
|
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
|
||
|
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Convert something to a string
|
||
|
// ---------------------------------------------------------------------------
|
||
|
#include <sstream>
|
||
|
template <class t>
|
||
|
inline string lj_gpu_toa(const t& in) {
|
||
|
ostringstream o;
|
||
|
o.precision(2);
|
||
|
o << in;
|
||
|
return o.str();
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Return string with GPU info
|
||
|
// ---------------------------------------------------------------------------
|
||
|
string lj_gpu_name(const int id, const int max_nbors) {
|
||
|
string name=LJMF.gpu.name(id)+", "+
|
||
|
lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
|
||
|
lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
|
||
|
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
|
||
|
lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
|
||
|
max_nbors))+" Atoms";
|
||
|
return name;
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Allocate memory on host and device and copy constants to device
|
||
|
// ---------------------------------------------------------------------------
|
||
|
int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
|
||
|
double **epsilon, double **host_lj1, double **host_lj2,
|
||
|
double **host_lj3, double **host_lj4, double **offset,
|
||
|
double *special_lj, const int max_nbors, const int gpu_id) {
|
||
|
if (LJMF.gpu.num_devices()==0)
|
||
|
return 0;
|
||
|
|
||
|
ij_size=IJ_SIZE;
|
||
|
return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
|
||
|
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Clear memory on host and device
|
||
|
// ---------------------------------------------------------------------------
|
||
|
void lj_gpu_clear() {
|
||
|
LJMF.clear();
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// copy atom positions and optionally types to device
|
||
|
// ---------------------------------------------------------------------------
|
||
|
template <class numtyp, class acctyp>
|
||
|
inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
|
||
|
const int *host_type, const bool rebuild,
|
||
|
cudaStream_t &stream) {
|
||
|
atom.time_atom.start();
|
||
|
atom.reset_write_buffer();
|
||
|
|
||
|
// First row of dev_x is x position, second is y, third is z
|
||
|
atom.add_atom_data(host_x[0],3);
|
||
|
atom.add_atom_data(host_x[0]+1,3);
|
||
|
atom.add_atom_data(host_x[0]+2,3);
|
||
|
|
||
|
int csize=3;
|
||
|
|
||
|
// If a rebuild occured, copy type data
|
||
|
if (rebuild) {
|
||
|
atom.add_atom_data(host_type);
|
||
|
csize++;
|
||
|
}
|
||
|
|
||
|
atom.copy_atom_data(csize,stream);
|
||
|
atom.time_atom.stop();
|
||
|
}
|
||
|
|
||
|
void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
|
||
|
_lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Signal that we need to transfer a new neighbor list
|
||
|
// ---------------------------------------------------------------------------
|
||
|
template <class LJMTyp>
|
||
|
bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum,
|
||
|
int *ilist, const int *numj) {
|
||
|
if (nall>ljm.max_atoms)
|
||
|
return false;
|
||
|
|
||
|
ljm.nbor.time_nbor.start();
|
||
|
|
||
|
ljm.atom.nall(nall);
|
||
|
ljm.atom.inum(inum);
|
||
|
ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);
|
||
|
|
||
|
ljm.nbor.time_nbor.stop();
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
|
||
|
const int *numj) {
|
||
|
return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Copy a set of ij_size ij interactions to device and compute energies,
|
||
|
// forces, and torques for those interactions
|
||
|
// ---------------------------------------------------------------------------
|
||
|
template <class LJMTyp>
|
||
|
void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
|
||
|
ljm.nbor.time_nbor.add_to_total();
|
||
|
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
|
||
|
|
||
|
ljm.nbor.time_nbor.start();
|
||
|
ljm.nbor.add(num_ij,ljm.pair_stream);
|
||
|
ljm.nbor.time_nbor.stop();
|
||
|
}
|
||
|
|
||
|
void lj_gpu_nbors(const int num_ij) {
|
||
|
_lj_gpu_nbors(LJMF,num_ij);
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Calculate energies and forces for all ij interactions
|
||
|
// ---------------------------------------------------------------------------
|
||
|
template <class numtyp, class acctyp>
|
||
|
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
|
||
|
// Compute the block size and grid size to keep all cores busy
|
||
|
const int BX=BLOCK_1D;
|
||
|
|
||
|
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
|
||
|
|
||
|
ljm.time_pair.start();
|
||
|
if (ljm.shared_types)
|
||
|
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
|
||
|
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
|
||
|
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
|
||
|
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
|
||
|
vflag, ljm.atom.inum(), ljm.atom.nall());
|
||
|
else
|
||
|
kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
|
||
|
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
|
||
|
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
|
||
|
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
|
||
|
vflag, ljm.atom.inum(), ljm.atom.nall());
|
||
|
ljm.time_pair.stop();
|
||
|
}
|
||
|
|
||
|
void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
|
||
|
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
|
||
|
}
|
||
|
|
||
|
// ---------------------------------------------------------------------------
|
||
|
// Get energies and forces to host
|
||
|
// ---------------------------------------------------------------------------
|
||
|
template<class numtyp, class acctyp>
|
||
|
double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
|
||
|
const bool eflag, const bool vflag, const bool eflag_atom,
|
||
|
const bool vflag_atom, double *eatom, double **vatom,
|
||
|
double *virial) {
|
||
|
double evdw;
|
||
|
|
||
|
ljm.atom.time_answer.start();
|
||
|
ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);
|
||
|
|
||
|
ljm.atom.time_atom.add_to_total();
|
||
|
ljm.nbor.time_nbor.add_to_total();
|
||
|
ljm.time_pair.add_to_total();
|
||
|
// CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
|
||
|
|
||
|
evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
|
||
|
ljm.atom.add_forces(ilist,f);
|
||
|
ljm.atom.time_answer.stop();
|
||
|
ljm.atom.time_answer.add_to_total();
|
||
|
return evdw;
|
||
|
}
|
||
|
|
||
|
double lj_gpu_forces(double **f, const int *ilist, const bool eflag,
|
||
|
const bool vflag, const bool eflag_atom,
|
||
|
const bool vflag_atom, double *eatom, double **vatom,
|
||
|
double *virial) {
|
||
|
return _lj_gpu_forces<PRECISION,ACC_PRECISION>
|
||
|
(LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
|
||
|
}
|
||
|
|
||
|
void lj_gpu_time() {
|
||
|
cout.precision(4);
|
||
|
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
|
||
|
cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
|
||
|
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";
|
||
|
cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
|
||
|
}
|
||
|
|
||
|
int lj_gpu_num_devices() {
|
||
|
return LJMF.gpu.num_devices();
|
||
|
}
|
||
|
|
||
|
double lj_gpu_bytes() {
|
||
|
return LJMF.host_memory_usage();
|
||
|
}
|