forked from lijiext/lammps
866 lines
24 KiB
C++
866 lines
24 KiB
C++
/***************************************************************************
|
|
gb_gpu_kernel.cu
|
|
-------------------
|
|
W. Michael Brown
|
|
|
|
Routines that actually perform the force/torque computation
|
|
|
|
*** Force Decomposition by Atom Version ***
|
|
__________________________________________________________________________
|
|
This file is part of the LAMMPS GPU Library
|
|
__________________________________________________________________________
|
|
|
|
begin : Tue Jun 23 2009
|
|
copyright : (C) 2009 by W. Michael Brown
|
|
email : wmbrown@sandia.gov
|
|
***************************************************************************/
|
|
|
|
/* -----------------------------------------------------------------------
|
|
Copyright (2009) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
----------------------------------------------------------------------- */
|
|
|
|
#ifndef GB_GPU_KERNEL
|
|
#define GB_GPU_KERNEL
|
|
|
|
#include "gb_gpu_extra.h"
|
|
|
|
template <class numtyp>
|
|
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
|
|
numtyp m2[9],
|
|
const int i,
|
|
numtyp ans[9])
|
|
{
|
|
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
|
|
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
|
|
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
|
|
den = (numtyp)1.0/den;
|
|
|
|
numtyp shapex=_shape_<numtyp>(i,0);
|
|
numtyp shapey=_shape_<numtyp>(i,1);
|
|
numtyp shapez=_shape_<numtyp>(i,2);
|
|
|
|
ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
|
|
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
|
|
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
|
|
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
|
|
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
|
|
|
|
ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
|
|
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
|
|
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
|
|
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
|
|
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
|
|
|
|
ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
|
|
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
|
|
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
|
|
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
|
|
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
|
|
|
|
ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
|
|
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
|
|
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
|
|
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
|
|
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
|
|
|
|
ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
|
|
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
|
|
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
|
|
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
|
|
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
|
|
|
|
ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
|
|
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
|
|
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
|
|
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
|
|
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
|
|
|
|
ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
|
|
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
|
|
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
|
|
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
|
|
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
|
|
|
|
ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
|
|
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
|
|
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
|
|
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
|
|
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
|
|
|
|
ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
|
|
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
|
|
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
|
|
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
|
|
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
|
}
|
|
|
|
template<class numtyp, class acctyp>
|
|
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
|
|
const int *dev_nbor, const int nbor_pitch,
|
|
acctyp *ans, size_t ans_pitch, int *err_flag,
|
|
const bool eflag, const bool vflag,
|
|
const int inum, const int nall) {
|
|
|
|
__shared__ numtyp sp_lj[4];
|
|
|
|
// ii indexes the two interacting particles in gi
|
|
int ii=threadIdx.x;
|
|
if (ii<4)
|
|
sp_lj[ii]=special_lj[ii];
|
|
ii+=INT_MUL(blockIdx.x,blockDim.x);
|
|
__syncthreads();
|
|
|
|
if (ii<inum) {
|
|
|
|
acctyp energy=(numtyp)0;
|
|
acctyp fx=(numtyp)0;
|
|
acctyp fy=(numtyp)0;
|
|
acctyp fz=(numtyp)0;
|
|
acctyp torx=(numtyp)0;
|
|
acctyp tory=(numtyp)0;
|
|
acctyp torz=(numtyp)0;
|
|
acctyp virial[6];
|
|
for (int i=0; i<6; i++)
|
|
virial[i]=(numtyp)0;
|
|
|
|
const int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
nbor+=nbor_pitch;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
|
|
|
|
numtyp ix=_x_<numtyp>(i,0);
|
|
numtyp iy=_x_<numtyp>(i,1);
|
|
numtyp iz=_x_<numtyp>(i,2);
|
|
int itype=_x_<numtyp>(i,7);
|
|
numtyp a1[9], b1[9], g1[9];
|
|
{
|
|
numtyp t[9];
|
|
gpu_quat_to_mat_trans(i,a1);
|
|
gpu_shape_times3(itype,a1,t);
|
|
gpu_transpose_times3(a1,t,g1);
|
|
gpu_well_times3(itype,a1,t);
|
|
gpu_transpose_times3(a1,t,b1);
|
|
}
|
|
|
|
numtyp factor_lj;
|
|
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
|
|
|
int j=*nbor;
|
|
if (j < nall)
|
|
factor_lj = (numtyp)1.0;
|
|
else {
|
|
factor_lj = sp_lj[j/nall];
|
|
j %= nall;
|
|
}
|
|
int jtype=_x_<numtyp>(j,7);
|
|
|
|
// Compute r12
|
|
numtyp r12[3];
|
|
r12[0] = _x_<numtyp>(j,0)-ix;
|
|
r12[1] = _x_<numtyp>(j,1)-iy;
|
|
r12[2] = _x_<numtyp>(j,2)-iz;
|
|
numtyp ir = gpu_dot3(r12,r12);
|
|
|
|
ir = rsqrt(ir);
|
|
numtyp r = (numtyp)1.0/ir;
|
|
|
|
numtyp a2[9];
|
|
gpu_quat_to_mat_trans(j,a2);
|
|
|
|
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
|
|
{ // Compute U_r, dUr, eta, and teta
|
|
// Compute g12
|
|
numtyp g12[9];
|
|
{
|
|
numtyp g2[9];
|
|
{
|
|
gpu_shape_times3(jtype,a2,g12);
|
|
gpu_transpose_times3(a2,g12,g2);
|
|
gpu_plus3(g1,g2,g12);
|
|
}
|
|
|
|
{ // Compute U_r and dUr
|
|
|
|
// Compute kappa
|
|
numtyp kappa[3];
|
|
gpu_mldivide3(g12,r12,kappa,err_flag);
|
|
|
|
// -- replace r12 with r12 hat
|
|
r12[0]*=ir;
|
|
r12[1]*=ir;
|
|
r12[2]*=ir;
|
|
|
|
// -- kappa is now / r
|
|
kappa[0]*=ir;
|
|
kappa[1]*=ir;
|
|
kappa[2]*=ir;
|
|
|
|
// energy
|
|
|
|
// compute u_r and dUr
|
|
numtyp uslj_rsq;
|
|
{
|
|
// Compute distance of closest approach
|
|
numtyp h12, sigma12;
|
|
sigma12 = gpu_dot3(r12,kappa);
|
|
sigma12 = rsqrt((numtyp)0.5*sigma12);
|
|
h12 = r-sigma12;
|
|
|
|
// -- kappa is now ok
|
|
kappa[0]*=r;
|
|
kappa[1]*=r;
|
|
kappa[2]*=r;
|
|
|
|
numtyp sigma = _sigma_<numtyp>(itype,jtype);
|
|
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
|
|
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
|
numtyp varrho6 = varrho*varrho*varrho;
|
|
varrho6*=varrho6;
|
|
numtyp varrho12 = varrho6*varrho6;
|
|
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
|
|
|
|
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
|
|
temp1 = temp1*(numtyp)24.0*epsilon;
|
|
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
|
|
numtyp temp2 = gpu_dot3(kappa,r12);
|
|
uslj_rsq = uslj_rsq*ir*ir;
|
|
|
|
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
|
|
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
|
|
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
|
|
}
|
|
|
|
// torque for particle 1
|
|
{
|
|
numtyp tempv[3], tempv2[3];
|
|
tempv[0] = -uslj_rsq*kappa[0];
|
|
tempv[1] = -uslj_rsq*kappa[1];
|
|
tempv[2] = -uslj_rsq*kappa[2];
|
|
gpu_row_times3(kappa,g1,tempv2);
|
|
gpu_cross3(tempv,tempv2,tUr);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Compute eta
|
|
{
|
|
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
|
|
numtyp det_g12 = gpu_det3(g12);
|
|
eta = pow(eta/det_g12,gum[1]);
|
|
}
|
|
|
|
// Compute teta
|
|
numtyp temp[9], tempv[3], tempv2[3];
|
|
compute_eta_torque(g12,a1,itype,temp);
|
|
numtyp temp1 = -eta*gum[1];
|
|
|
|
tempv[0] = temp1*temp[0];
|
|
tempv[1] = temp1*temp[1];
|
|
tempv[2] = temp1*temp[2];
|
|
gpu_cross3(a1,tempv,tempv2);
|
|
teta[0] = tempv2[0];
|
|
teta[1] = tempv2[1];
|
|
teta[2] = tempv2[2];
|
|
|
|
tempv[0] = temp1*temp[3];
|
|
tempv[1] = temp1*temp[4];
|
|
tempv[2] = temp1*temp[5];
|
|
gpu_cross3(a1+3,tempv,tempv2);
|
|
teta[0] += tempv2[0];
|
|
teta[1] += tempv2[1];
|
|
teta[2] += tempv2[2];
|
|
|
|
tempv[0] = temp1*temp[6];
|
|
tempv[1] = temp1*temp[7];
|
|
tempv[2] = temp1*temp[8];
|
|
gpu_cross3(a1+6,tempv,tempv2);
|
|
teta[0] += tempv2[0];
|
|
teta[1] += tempv2[1];
|
|
teta[2] += tempv2[2];
|
|
}
|
|
|
|
numtyp chi, dchi[3], tchi[3];
|
|
{ // Compute chi and dchi
|
|
|
|
// Compute b12
|
|
numtyp b2[9], b12[9];
|
|
{
|
|
gpu_well_times3(jtype,a2,b12);
|
|
gpu_transpose_times3(a2,b12,b2);
|
|
gpu_plus3(b1,b2,b12);
|
|
}
|
|
|
|
// compute chi_12
|
|
r12[0]*=r;
|
|
r12[1]*=r;
|
|
r12[2]*=r;
|
|
numtyp iota[3];
|
|
gpu_mldivide3(b12,r12,iota,err_flag);
|
|
// -- iota is now iota/r
|
|
iota[0]*=ir;
|
|
iota[1]*=ir;
|
|
iota[2]*=ir;
|
|
r12[0]*=ir;
|
|
r12[1]*=ir;
|
|
r12[2]*=ir;
|
|
chi = gpu_dot3(r12,iota);
|
|
chi = pow(chi*(numtyp)2.0,gum[2]);
|
|
|
|
// -- iota is now ok
|
|
iota[0]*=r;
|
|
iota[1]*=r;
|
|
iota[2]*=r;
|
|
|
|
numtyp temp1 = gpu_dot3(iota,r12);
|
|
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
|
|
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
|
|
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
|
|
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
|
|
|
|
// compute t_chi
|
|
numtyp tempv[3];
|
|
gpu_row_times3(iota,b1,tempv);
|
|
gpu_cross3(tempv,iota,tchi);
|
|
temp1 = (numtyp)-4.0*ir*ir;
|
|
tchi[0] *= temp1;
|
|
tchi[1] *= temp1;
|
|
tchi[2] *= temp1;
|
|
}
|
|
|
|
numtyp temp2 = factor_lj*eta*chi;
|
|
if (eflag)
|
|
energy+=u_r*temp2;
|
|
numtyp temp1 = -eta*u_r*factor_lj;
|
|
if (vflag) {
|
|
r12[0]*=-r;
|
|
r12[1]*=-r;
|
|
r12[2]*=-r;
|
|
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
|
|
fx+=ft;
|
|
virial[0]+=r12[0]*ft;
|
|
ft=temp1*dchi[1]-temp2*dUr[1];
|
|
fy+=ft;
|
|
virial[1]+=r12[1]*ft;
|
|
virial[3]+=r12[0]*ft;
|
|
ft=temp1*dchi[2]-temp2*dUr[2];
|
|
fz+=ft;
|
|
virial[2]+=r12[2]*ft;
|
|
virial[4]+=r12[0]*ft;
|
|
virial[5]+=r12[1]*ft;
|
|
} else {
|
|
fx+=temp1*dchi[0]-temp2*dUr[0];
|
|
fy+=temp1*dchi[1]-temp2*dUr[1];
|
|
fz+=temp1*dchi[2]-temp2*dUr[2];
|
|
}
|
|
|
|
// Torque on 1
|
|
temp1 = -u_r*eta*factor_lj;
|
|
temp2 = -u_r*chi*factor_lj;
|
|
numtyp temp3 = -chi*eta*factor_lj;
|
|
torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
|
|
tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
|
|
torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
|
|
|
|
} // for nbor
|
|
|
|
// Store answers
|
|
acctyp *ap1=ans+ii;
|
|
if (eflag) {
|
|
*ap1=energy;
|
|
ap1+=ans_pitch;
|
|
}
|
|
if (vflag) {
|
|
for (int i=0; i<6; i++) {
|
|
*ap1=virial[i];
|
|
ap1+=ans_pitch;
|
|
}
|
|
}
|
|
*ap1=fx;
|
|
ap1+=ans_pitch;
|
|
*ap1=fy;
|
|
ap1+=ans_pitch;
|
|
*ap1=fz;
|
|
ap1+=ans_pitch;
|
|
*ap1=torx;
|
|
ap1+=ans_pitch;
|
|
*ap1=tory;
|
|
ap1+=ans_pitch;
|
|
*ap1=torz;
|
|
|
|
} // if ii
|
|
}
|
|
|
|
template<class numtyp, class acctyp>
|
|
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
|
|
const int *dev_nbor, const int nbor_pitch,
|
|
acctyp *ans, size_t ans_pitch, int *err_flag,
|
|
const bool eflag, const bool vflag,
|
|
const int start, const int inum,
|
|
const int nall) {
|
|
__shared__ numtyp sp_lj[4];
|
|
|
|
// ii indexes the two interacting particles in gi
|
|
int ii=threadIdx.x;
|
|
if (ii<4)
|
|
sp_lj[ii]=special_lj[ii];
|
|
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
|
__syncthreads();
|
|
|
|
if (ii<inum) {
|
|
|
|
acctyp energy=(numtyp)0;
|
|
acctyp fx=(numtyp)0;
|
|
acctyp fy=(numtyp)0;
|
|
acctyp fz=(numtyp)0;
|
|
acctyp virial[6];
|
|
for (int i=0; i<6; i++)
|
|
virial[i]=(numtyp)0;
|
|
|
|
const int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
nbor+=nbor_pitch;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
|
|
|
|
numtyp ix=_x_<numtyp>(i,0);
|
|
numtyp iy=_x_<numtyp>(i,1);
|
|
numtyp iz=_x_<numtyp>(i,2);
|
|
int itype=_x_<numtyp>(i,7);
|
|
|
|
numtyp oner=_shape_<numtyp>(itype,0);
|
|
numtyp one_well=_well_<numtyp>(itype,0);
|
|
|
|
numtyp factor_lj;
|
|
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
|
|
|
int j=*nbor;
|
|
if (j < nall)
|
|
factor_lj = (numtyp)1.0;
|
|
else {
|
|
factor_lj = sp_lj[j/nall];
|
|
j %= nall;
|
|
}
|
|
int jtype=_x_<numtyp>(j,7);
|
|
|
|
// Compute r12
|
|
numtyp r12[3];
|
|
r12[0] = _x_<numtyp>(j,0)-ix;
|
|
r12[1] = _x_<numtyp>(j,1)-iy;
|
|
r12[2] = _x_<numtyp>(j,2)-iz;
|
|
numtyp ir = gpu_dot3(r12,r12);
|
|
|
|
ir = rsqrt(ir);
|
|
numtyp r = (numtyp)1.0/ir;
|
|
|
|
numtyp r12hat[3];
|
|
r12hat[0]=r12[0]*ir;
|
|
r12hat[1]=r12[1]*ir;
|
|
r12hat[2]=r12[2]*ir;
|
|
|
|
numtyp a2[9];
|
|
gpu_quat_to_mat_trans(j,a2);
|
|
|
|
numtyp u_r, dUr[3], eta;
|
|
{ // Compute U_r, dUr, eta, and teta
|
|
// Compute g12
|
|
numtyp g12[9];
|
|
{
|
|
{
|
|
numtyp g2[9];
|
|
gpu_shape_times3(jtype,a2,g12);
|
|
gpu_transpose_times3(a2,g12,g2);
|
|
g12[0]=g2[0]+oner;
|
|
g12[4]=g2[4]+oner;
|
|
g12[8]=g2[8]+oner;
|
|
g12[1]=g2[1];
|
|
g12[2]=g2[2];
|
|
g12[3]=g2[3];
|
|
g12[5]=g2[5];
|
|
g12[6]=g2[6];
|
|
g12[7]=g2[7];
|
|
}
|
|
|
|
{ // Compute U_r and dUr
|
|
|
|
// Compute kappa
|
|
numtyp kappa[3];
|
|
gpu_mldivide3(g12,r12,kappa,err_flag);
|
|
|
|
// -- kappa is now / r
|
|
kappa[0]*=ir;
|
|
kappa[1]*=ir;
|
|
kappa[2]*=ir;
|
|
|
|
// energy
|
|
|
|
// compute u_r and dUr
|
|
numtyp uslj_rsq;
|
|
{
|
|
// Compute distance of closest approach
|
|
numtyp h12, sigma12;
|
|
sigma12 = gpu_dot3(r12hat,kappa);
|
|
sigma12 = rsqrt((numtyp)0.5*sigma12);
|
|
h12 = r-sigma12;
|
|
|
|
// -- kappa is now ok
|
|
kappa[0]*=r;
|
|
kappa[1]*=r;
|
|
kappa[2]*=r;
|
|
|
|
numtyp sigma = _sigma_<numtyp>(itype,jtype);
|
|
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
|
|
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
|
numtyp varrho6 = varrho*varrho*varrho;
|
|
varrho6*=varrho6;
|
|
numtyp varrho12 = varrho6*varrho6;
|
|
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
|
|
|
|
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
|
|
temp1 = temp1*(numtyp)24.0*epsilon;
|
|
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
|
|
numtyp temp2 = gpu_dot3(kappa,r12hat);
|
|
uslj_rsq = uslj_rsq*ir*ir;
|
|
|
|
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
|
|
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
|
|
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Compute eta
|
|
{
|
|
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
|
|
numtyp det_g12 = gpu_det3(g12);
|
|
eta = pow(eta/det_g12,gum[1]);
|
|
}
|
|
}
|
|
|
|
numtyp chi, dchi[3];
|
|
{ // Compute chi and dchi
|
|
|
|
// Compute b12
|
|
numtyp b12[9];
|
|
{
|
|
numtyp b2[9];
|
|
gpu_well_times3(jtype,a2,b12);
|
|
gpu_transpose_times3(a2,b12,b2);
|
|
b12[0]=b2[0]+one_well;
|
|
b12[4]=b2[4]+one_well;
|
|
b12[8]=b2[8]+one_well;
|
|
b12[1]=b2[1];
|
|
b12[2]=b2[2];
|
|
b12[3]=b2[3];
|
|
b12[5]=b2[5];
|
|
b12[6]=b2[6];
|
|
b12[7]=b2[7];
|
|
}
|
|
|
|
// compute chi_12
|
|
numtyp iota[3];
|
|
gpu_mldivide3(b12,r12,iota,err_flag);
|
|
// -- iota is now iota/r
|
|
iota[0]*=ir;
|
|
iota[1]*=ir;
|
|
iota[2]*=ir;
|
|
chi = gpu_dot3(r12hat,iota);
|
|
chi = pow(chi*(numtyp)2.0,gum[2]);
|
|
|
|
// -- iota is now ok
|
|
iota[0]*=r;
|
|
iota[1]*=r;
|
|
iota[2]*=r;
|
|
|
|
numtyp temp1 = gpu_dot3(iota,r12hat);
|
|
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
|
|
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
|
|
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
|
|
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
|
|
}
|
|
|
|
numtyp temp2 = factor_lj*eta*chi;
|
|
if (eflag)
|
|
energy+=u_r*temp2;
|
|
numtyp temp1 = -eta*u_r*factor_lj;
|
|
if (vflag) {
|
|
r12[0]*=-1;
|
|
r12[1]*=-1;
|
|
r12[2]*=-1;
|
|
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
|
|
fx+=ft;
|
|
virial[0]+=r12[0]*ft;
|
|
ft=temp1*dchi[1]-temp2*dUr[1];
|
|
fy+=ft;
|
|
virial[1]+=r12[1]*ft;
|
|
virial[3]+=r12[0]*ft;
|
|
ft=temp1*dchi[2]-temp2*dUr[2];
|
|
fz+=ft;
|
|
virial[2]+=r12[2]*ft;
|
|
virial[4]+=r12[0]*ft;
|
|
virial[5]+=r12[1]*ft;
|
|
} else {
|
|
fx+=temp1*dchi[0]-temp2*dUr[0];
|
|
fy+=temp1*dchi[1]-temp2*dUr[1];
|
|
fz+=temp1*dchi[2]-temp2*dUr[2];
|
|
}
|
|
} // for nbor
|
|
|
|
// Store answers
|
|
acctyp *ap1=ans+ii;
|
|
if (eflag) {
|
|
*ap1=energy;
|
|
ap1+=ans_pitch;
|
|
}
|
|
if (vflag) {
|
|
for (int i=0; i<6; i++) {
|
|
*ap1=virial[i];
|
|
ap1+=ans_pitch;
|
|
}
|
|
}
|
|
*ap1=fx;
|
|
ap1+=ans_pitch;
|
|
*ap1=fy;
|
|
ap1+=ans_pitch;
|
|
*ap1=fz;
|
|
} // if ii
|
|
}
|
|
|
|
template<class numtyp, class acctyp>
|
|
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
|
|
const int *dev_ij, const int nbor_pitch, acctyp *ans,
|
|
size_t ans_pitch, int *err_flag, const bool eflag,
|
|
const bool vflag, const int start, const int inum,
|
|
const int nall) {
|
|
__shared__ numtyp sp_lj[4];
|
|
|
|
// ii indexes the two interacting particles in gi
|
|
int ii=threadIdx.x;
|
|
if (ii<4)
|
|
sp_lj[ii]=special_lj[ii];
|
|
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
|
__syncthreads();
|
|
|
|
if (ii<inum) {
|
|
|
|
acctyp energy=(numtyp)0;
|
|
acctyp fx=(numtyp)0;
|
|
acctyp fy=(numtyp)0;
|
|
acctyp fz=(numtyp)0;
|
|
acctyp virial[6];
|
|
for (int i=0; i<6; i++)
|
|
virial[i]=(numtyp)0;
|
|
|
|
const int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *list=dev_ij+*nbor;
|
|
const int *list_end=list+numj;
|
|
|
|
numtyp ix=_x_<numtyp>(i,0);
|
|
numtyp iy=_x_<numtyp>(i,1);
|
|
numtyp iz=_x_<numtyp>(i,2);
|
|
int itype=_x_<numtyp>(i,7);
|
|
|
|
numtyp factor_lj;
|
|
for ( ; list<list_end; list++) {
|
|
|
|
int j=*list;
|
|
if (j < nall)
|
|
factor_lj = (numtyp)1.0;
|
|
else {
|
|
factor_lj = sp_lj[j/nall];
|
|
j %= nall;
|
|
}
|
|
int jtype=_x_<numtyp>(j,7);
|
|
|
|
// Compute r12
|
|
numtyp delx = ix-_x_<numtyp>(j,0);
|
|
numtyp dely = iy-_x_<numtyp>(j,1);
|
|
numtyp delz = iz-_x_<numtyp>(j,2);
|
|
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
|
|
|
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
|
|
_form_(itype,jtype)==SPHERE_SPHERE) {
|
|
r2inv=(numtyp)1.0/r2inv;
|
|
numtyp r6inv = r2inv*r2inv*r2inv;
|
|
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
|
|
_lj1_<numtyp>(itype,jtype).y);
|
|
force*=factor_lj;
|
|
|
|
fx+=delx*force;
|
|
fy+=dely*force;
|
|
fz+=delz*force;
|
|
|
|
if (eflag) {
|
|
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
|
|
_lj3_<numtyp>(itype,jtype).y);
|
|
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
|
|
}
|
|
if (vflag) {
|
|
virial[0] += delx*delx*force;
|
|
virial[1] += dely*dely*force;
|
|
virial[2] += delz*delz*force;
|
|
virial[3] += delx*dely*force;
|
|
virial[4] += delx*delz*force;
|
|
virial[5] += dely*delz*force;
|
|
}
|
|
}
|
|
|
|
} // for nbor
|
|
|
|
// Store answers
|
|
acctyp *ap1=ans+ii;
|
|
if (eflag) {
|
|
*ap1+=energy;
|
|
ap1+=ans_pitch;
|
|
}
|
|
if (vflag) {
|
|
for (int i=0; i<6; i++) {
|
|
*ap1+=virial[i];
|
|
ap1+=ans_pitch;
|
|
}
|
|
}
|
|
*ap1+=fx;
|
|
ap1+=ans_pitch;
|
|
*ap1+=fy;
|
|
ap1+=ans_pitch;
|
|
*ap1+=fz;
|
|
|
|
} // if ii
|
|
}
|
|
|
|
template<class numtyp, class acctyp>
|
|
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
|
|
const int *dev_ij, const int nbor_pitch,
|
|
acctyp *ans, size_t ans_pitch,int *err_flag,
|
|
const bool eflag, const bool vflag,
|
|
const int start, const int inum, const int nall){
|
|
// ii indexes the two interacting particles in gi
|
|
int ii=threadIdx.x;
|
|
__shared__ numtyp sp_lj[4];
|
|
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
|
if (ii<4)
|
|
sp_lj[ii]=special_lj[ii];
|
|
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
|
int itype=ii/MAX_SHARED_TYPES;
|
|
int jtype=ii%MAX_SHARED_TYPES;
|
|
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
|
|
form[ii]=_form_(itype,jtype);
|
|
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
|
|
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
|
|
if (eflag) {
|
|
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
|
|
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
|
|
offset[ii]=_offset_<numtyp>(itype,jtype);
|
|
}
|
|
}
|
|
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
|
|
__syncthreads();
|
|
|
|
if (ii<inum) {
|
|
|
|
acctyp energy=(numtyp)0;
|
|
acctyp fx=(numtyp)0;
|
|
acctyp fy=(numtyp)0;
|
|
acctyp fz=(numtyp)0;
|
|
acctyp virial[6];
|
|
for (int i=0; i<6; i++)
|
|
virial[i]=(numtyp)0;
|
|
|
|
const int *nbor=dev_nbor+ii;
|
|
int i=*nbor;
|
|
nbor+=nbor_pitch;
|
|
int numj=*nbor;
|
|
nbor+=nbor_pitch;
|
|
const int *list=dev_ij+*nbor;
|
|
const int *list_end=list+numj;
|
|
|
|
numtyp ix=_x_<numtyp>(i,0);
|
|
numtyp iy=_x_<numtyp>(i,1);
|
|
numtyp iz=_x_<numtyp>(i,2);
|
|
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
|
|
|
|
numtyp factor_lj;
|
|
for ( ; list<list_end; list++) {
|
|
|
|
int j=*list;
|
|
if (j < nall)
|
|
factor_lj = (numtyp)1.0;
|
|
else {
|
|
factor_lj = sp_lj[j/nall];
|
|
j %= nall;
|
|
}
|
|
int mtype=itype+_x_<numtyp>(j,7);
|
|
|
|
// Compute r12
|
|
numtyp delx = ix-_x_<numtyp>(j,0);
|
|
numtyp dely = iy-_x_<numtyp>(j,1);
|
|
numtyp delz = iz-_x_<numtyp>(j,2);
|
|
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
|
|
|
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
|
|
r2inv=(numtyp)1.0/r2inv;
|
|
numtyp r6inv = r2inv*r2inv*r2inv;
|
|
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
|
|
|
|
fx+=delx*force;
|
|
fy+=dely*force;
|
|
fz+=delz*force;
|
|
|
|
if (eflag) {
|
|
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
|
|
energy+=factor_lj*(e-offset[mtype]);
|
|
}
|
|
if (vflag) {
|
|
virial[0] += delx*delx*force;
|
|
virial[1] += dely*dely*force;
|
|
virial[2] += delz*delz*force;
|
|
virial[3] += delx*dely*force;
|
|
virial[4] += delx*delz*force;
|
|
virial[5] += dely*delz*force;
|
|
}
|
|
}
|
|
|
|
} // for nbor
|
|
|
|
// Store answers
|
|
acctyp *ap1=ans+ii;
|
|
if (eflag) {
|
|
*ap1+=energy;
|
|
ap1+=ans_pitch;
|
|
}
|
|
if (vflag) {
|
|
for (int i=0; i<6; i++) {
|
|
*ap1+=virial[i];
|
|
ap1+=ans_pitch;
|
|
}
|
|
}
|
|
*ap1+=fx;
|
|
ap1+=ans_pitch;
|
|
*ap1+=fy;
|
|
ap1+=ans_pitch;
|
|
*ap1+=fz;
|
|
|
|
} // if ii
|
|
}
|
|
|
|
#endif
|