forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6407 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
9418bacff7
commit
945343265e
|
@ -0,0 +1,531 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef CMMM_GPU_KERNEL
|
||||
#define CMMM_GPU_KERNEL
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
__inline double4 fetch_pos(const int& i, const double4 *pos)
|
||||
{
|
||||
return pos[i];
|
||||
}
|
||||
__inline double fetch_q(const int& i, const double *q)
|
||||
{
|
||||
return q[i];
|
||||
}
|
||||
#else
|
||||
__inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{
|
||||
return tex1Dfetch(pos_tex, i);
|
||||
}
|
||||
__inline float fetch_q(const int& i, const float *q)
|
||||
{
|
||||
return tex1Dfetch(q_tex, i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define fetch_q(i,y) q_[i]
|
||||
#define BLOCK_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp2 double2
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifdef _SINGLE_DOUBLE
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
__global acctyp *engv, const int eflag,
|
||||
const int vflag, const int inum,
|
||||
const int nbor_pitch, __global numtyp *q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const int smooth, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[8];
|
||||
sp_lj[0]=sp_lj_in[0];
|
||||
sp_lj[1]=sp_lj_in[1];
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
sp_lj[4]=sp_lj_in[4];
|
||||
sp_lj[5]=sp_lj_in[5];
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
__local numtyp _ia;
|
||||
__local numtyp _ia2;
|
||||
__local numtyp _ia3;
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
|
||||
_ia2=(numtyp)-1.0/cut_coulsq;
|
||||
_ia3=_ia2*_ia;
|
||||
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
int n_stride;
|
||||
__global int *list_end;
|
||||
if (dev_nbor==dev_packed) {
|
||||
list_end=nbor+mul24(numj,nbor_pitch);
|
||||
nbor+=mul24(offset,nbor_pitch);
|
||||
n_stride=mul24(t_per_atom,nbor_pitch);
|
||||
} else {
|
||||
nbor=dev_packed+*nbor;
|
||||
list_end=nbor+numj;
|
||||
n_stride=t_per_atom;
|
||||
nbor+=offset;
|
||||
}
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<lj1[mtype].x) {
|
||||
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
|
||||
numtyp r2inv=(numtyp)1.0/rsq;
|
||||
|
||||
if (rsq < lj1[mtype].y) {
|
||||
if (lj3[mtype].x == (numtyp)2) {
|
||||
inv1=r2inv*r2inv;
|
||||
inv2=inv1*inv1;
|
||||
} else if (lj3[mtype].x == (numtyp)1) {
|
||||
inv2=r2inv*sqrt(r2inv);
|
||||
inv1=inv2*inv2;
|
||||
} else {
|
||||
inv1=r2inv*r2inv*r2inv;
|
||||
inv2=inv1;
|
||||
}
|
||||
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
|
||||
if (rsq < cut_coulsq) {
|
||||
ir = (numtyp)1.0/sqrt(rsq);
|
||||
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
|
||||
r2_ia2 = rsq*_ia2;
|
||||
r4_ia4 = r2_ia2*r2_ia2;
|
||||
if (smooth==0)
|
||||
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
|
||||
(numtyp)1.875*r4_ia4)-ir/rsq-
|
||||
factor_coul*ir);
|
||||
else {
|
||||
r6_ia6 = r2_ia2*r4_ia4;
|
||||
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
|
||||
r2_ia2-(numtyp)8.4375*r4_ia4+
|
||||
(numtyp)2.1875*r6_ia6)-ir/rsq-
|
||||
factor_coul*ir);
|
||||
}
|
||||
} else {
|
||||
forcecoul = (numtyp)0.0;
|
||||
prefactor = (numtyp)0.0;
|
||||
}
|
||||
|
||||
force = forcecoul + force_lj * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (rsq < cut_coulsq)
|
||||
if (smooth==0)
|
||||
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
|
||||
(numtyp)1.3125*r4_ia4-
|
||||
(numtyp)0.3125*r4_ia4*r2_ia2)-
|
||||
factor_coul*ir);
|
||||
else
|
||||
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
|
||||
r2_ia2+(numtyp)2.953125*r4_ia4-
|
||||
(numtyp)1.40625*r6_ia6+
|
||||
(numtyp)0.2734375*r4_ia4*r4_ia4));
|
||||
|
||||
if (rsq < lj1[mtype].y) {
|
||||
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
|
||||
lj3[mtype].w;
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
red_acc[4][tid]=e_coul;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<5; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
e_coul=red_acc[4][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
*ap1=e_coul;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, __global numtyp *q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const int smooth, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[8];
|
||||
if (tid<8)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
lj1[tid]=lj1_in[tid];
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local numtyp _ia;
|
||||
__local numtyp _ia2;
|
||||
__local numtyp _ia3;
|
||||
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
|
||||
_ia2=(numtyp)1.0/cut_coulsq;
|
||||
_ia3=_ia2*_ia;
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
int n_stride;
|
||||
__global int *list_end;
|
||||
if (dev_nbor==dev_packed) {
|
||||
list_end=nbor+mul24(numj,nbor_pitch);
|
||||
nbor+=mul24(offset,nbor_pitch);
|
||||
n_stride=mul24(t_per_atom,nbor_pitch);
|
||||
} else {
|
||||
nbor=dev_packed+*nbor;
|
||||
list_end=nbor+numj;
|
||||
n_stride=t_per_atom;
|
||||
nbor+=offset;
|
||||
}
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<lj1[mtype].x) {
|
||||
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
|
||||
numtyp r2inv=(numtyp)1.0/rsq;
|
||||
|
||||
if (rsq < lj1[mtype].y) {
|
||||
if (lj3[mtype].x == (numtyp)2) {
|
||||
inv1=r2inv*r2inv;
|
||||
inv2=inv1*inv1;
|
||||
} else if (lj3[mtype].x == (numtyp)1) {
|
||||
inv2=r2inv*sqrt(r2inv);
|
||||
inv1=inv2*inv2;
|
||||
} else {
|
||||
inv1=r2inv*r2inv*r2inv;
|
||||
inv2=inv1;
|
||||
}
|
||||
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
|
||||
if (rsq < cut_coulsq) {
|
||||
ir = (numtyp)1.0/sqrt(rsq);
|
||||
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
|
||||
r2_ia2 = rsq*_ia2;
|
||||
r4_ia4 = r2_ia2*r2_ia2;
|
||||
if (smooth==0)
|
||||
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
|
||||
(numtyp)1.875*r4_ia4)-ir/rsq-
|
||||
factor_coul*ir);
|
||||
else {
|
||||
r6_ia6 = r2_ia2*r4_ia4;
|
||||
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
|
||||
r2_ia2-(numtyp)8.4375*r4_ia4+
|
||||
(numtyp)2.1875*r6_ia6)-ir/rsq-
|
||||
factor_coul*ir);
|
||||
}
|
||||
} else {
|
||||
forcecoul = (numtyp)0.0;
|
||||
prefactor = (numtyp)0.0;
|
||||
}
|
||||
|
||||
force = forcecoul + force_lj * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (rsq < cut_coulsq)
|
||||
if (smooth==0)
|
||||
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
|
||||
(numtyp)1.3125*r4_ia4-
|
||||
(numtyp)0.3125*r4_ia4*r2_ia2)-
|
||||
factor_coul*ir);
|
||||
else
|
||||
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
|
||||
r2_ia2+(numtyp)2.953125*r4_ia4-
|
||||
(numtyp)1.40625*r6_ia6+
|
||||
(numtyp)0.2734375*r4_ia4*r4_ia4));
|
||||
if (rsq < lj1[mtype].y) {
|
||||
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
|
||||
lj3[mtype].w;
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
red_acc[4][tid]=e_coul;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<5; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
e_coul=red_acc[4][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
*ap1=e_coul;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii*/
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
// **************************************************************************
|
||||
// ellipsoid_nbor.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for Ellipsoid neighbor routines
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef ELLIPSOID_NBOR_H
|
||||
#define ELLIPSOID_NBOR_H
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp2 double2
|
||||
#define numtyp4 double4
|
||||
#else
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
||||
// -- Only unpack neighbors matching the specified inclusive range of forms
|
||||
// -- Only unpack neighbors within cutoff
|
||||
// ---------------------------------------------------------------------------
|
||||
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
const int ntypes, __global int *dev_nbor,
|
||||
const int nbor_pitch, const int start, const int inum,
|
||||
__global int *dev_ij, const int form_low,
|
||||
const int form_high) {
|
||||
|
||||
// ii indexes the two interacting particles in gi
|
||||
int ii=GLOBAL_ID_X+start;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
__global int *list_end=nbor+mul24(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24(iw,ntypes);
|
||||
int newj=0;
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
numtyp2 cf=cut_form[mtype];
|
||||
if (cf.y>=form_low && cf.y<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cf.x) {
|
||||
*packed=j;
|
||||
packed+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
dev_nbor[ii+nbor_pitch]=newj;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
|
||||
// -- Only unpack neighbors matching the specified inclusive range of forms
|
||||
// -- Only unpack neighbors within cutoff
|
||||
// -- Fast version of routine that uses shared memory for LJ constants
|
||||
// ---------------------------------------------------------------------------
|
||||
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
__global int *dev_nbor, const int nbor_pitch,
|
||||
const int start, const int inum,
|
||||
__global int *dev_ij, const int form_low,
|
||||
const int form_high) {
|
||||
|
||||
int ii=THREAD_ID_X;
|
||||
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
cutsq[ii]=cut_form[ii].x;
|
||||
form[ii]=cut_form[ii].y;
|
||||
}
|
||||
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
__global int *list_end=nbor+mul24(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
int newj=0;
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
|
||||
if (form[mtype]>=form_low && form[mtype]<=form_high) {
|
||||
// Compute r12;
|
||||
numtyp rsq=jx.x-ix.x;
|
||||
rsq*=rsq;
|
||||
numtyp t=jx.y-ix.y;
|
||||
rsq+=t*t;
|
||||
t=jx.z-ix.z;
|
||||
rsq+=t*t;
|
||||
|
||||
if (rsq<cutsq[mtype]) {
|
||||
*packed=j;
|
||||
packed+=nbor_pitch;
|
||||
newj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
dev_nbor[ii+nbor_pitch]=newj;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,429 @@
|
|||
// **************************************************************************
|
||||
// gayberne.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for Gay-Berne potential acceleration
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef GAYBERNE_CU
|
||||
#define GAYBERNE_CU
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
|
||||
numtyp ans[9])
|
||||
{
|
||||
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
|
||||
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
|
||||
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
|
||||
den = (numtyp)1.0/den;
|
||||
|
||||
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
|
||||
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
|
||||
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
|
||||
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
|
||||
|
||||
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
|
||||
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
|
||||
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
|
||||
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
|
||||
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
|
||||
|
||||
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
|
||||
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
|
||||
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
|
||||
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
|
||||
|
||||
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
|
||||
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
|
||||
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
|
||||
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
|
||||
|
||||
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
|
||||
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
|
||||
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
|
||||
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
|
||||
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
|
||||
|
||||
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
|
||||
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
|
||||
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
|
||||
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
|
||||
|
||||
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
|
||||
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
|
||||
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
|
||||
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
|
||||
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
|
||||
|
||||
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
|
||||
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
|
||||
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
|
||||
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
|
||||
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
|
||||
|
||||
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
|
||||
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
|
||||
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
|
||||
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
|
||||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
}
|
||||
|
||||
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *gum, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global numtyp *lshape,
|
||||
__global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, const int astride,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=gum[3];
|
||||
sp_lj[1]=gum[4];
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp4 tor;
|
||||
tor.x=(acctyp)0;
|
||||
tor.y=(acctyp)0;
|
||||
tor.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *nbor_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
numtyp a1[9], b1[9], g1[9];
|
||||
numtyp4 ishape=shape[itype];
|
||||
{
|
||||
numtyp t[9];
|
||||
gpu_quat_to_mat_trans(q,i,a1);
|
||||
gpu_diag_times3(ishape,a1,t);
|
||||
gpu_transpose_times3(a1,t,g1);
|
||||
gpu_diag_times3(well[itype],a1,t);
|
||||
gpu_transpose_times3(a1,t,b1);
|
||||
}
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r12[3];
|
||||
r12[0] = jx.x-ix.x;
|
||||
r12[1] = jx.y-ix.y;
|
||||
r12[2] = jx.z-ix.z;
|
||||
numtyp ir = gpu_dot3(r12,r12);
|
||||
|
||||
ir = rsqrt(ir);
|
||||
numtyp r = (numtyp)1.0/ir;
|
||||
|
||||
numtyp a2[9];
|
||||
gpu_quat_to_mat_trans(q,j,a2);
|
||||
|
||||
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
|
||||
{ // Compute U_r, dUr, eta, and teta
|
||||
// Compute g12
|
||||
numtyp g12[9];
|
||||
{
|
||||
numtyp g2[9];
|
||||
{
|
||||
gpu_diag_times3(shape[jtype],a2,g12);
|
||||
gpu_transpose_times3(a2,g12,g2);
|
||||
gpu_plus3(g1,g2,g12);
|
||||
}
|
||||
|
||||
{ // Compute U_r and dUr
|
||||
|
||||
// Compute kappa
|
||||
numtyp kappa[3];
|
||||
gpu_mldivide3(g12,r12,kappa,err_flag);
|
||||
|
||||
// -- replace r12 with r12 hat
|
||||
r12[0]*=ir;
|
||||
r12[1]*=ir;
|
||||
r12[2]*=ir;
|
||||
|
||||
// -- kappa is now / r
|
||||
kappa[0]*=ir;
|
||||
kappa[1]*=ir;
|
||||
kappa[2]*=ir;
|
||||
|
||||
// energy
|
||||
|
||||
// compute u_r and dUr
|
||||
numtyp uslj_rsq;
|
||||
{
|
||||
// Compute distance of closest approach
|
||||
numtyp h12, sigma12;
|
||||
sigma12 = gpu_dot3(r12,kappa);
|
||||
sigma12 = rsqrt((numtyp)0.5*sigma12);
|
||||
h12 = r-sigma12;
|
||||
|
||||
// -- kappa is now ok
|
||||
kappa[0]*=r;
|
||||
kappa[1]*=r;
|
||||
kappa[2]*=r;
|
||||
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
numtyp sigma = sig_eps[mtype].x;
|
||||
numtyp epsilon = sig_eps[mtype].y;
|
||||
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
||||
numtyp varrho6 = varrho*varrho*varrho;
|
||||
varrho6*=varrho6;
|
||||
numtyp varrho12 = varrho6*varrho6;
|
||||
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
|
||||
|
||||
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
|
||||
temp1 = temp1*(numtyp)24.0*epsilon;
|
||||
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
|
||||
numtyp temp2 = gpu_dot3(kappa,r12);
|
||||
uslj_rsq = uslj_rsq*ir*ir;
|
||||
|
||||
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
|
||||
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
|
||||
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
|
||||
}
|
||||
|
||||
// torque for particle 1
|
||||
{
|
||||
numtyp tempv[3], tempv2[3];
|
||||
tempv[0] = -uslj_rsq*kappa[0];
|
||||
tempv[1] = -uslj_rsq*kappa[1];
|
||||
tempv[2] = -uslj_rsq*kappa[2];
|
||||
gpu_row_times3(kappa,g1,tempv2);
|
||||
gpu_cross3(tempv,tempv2,tUr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute eta
|
||||
{
|
||||
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
|
||||
numtyp det_g12 = gpu_det3(g12);
|
||||
eta = pow(eta/det_g12,gum[1]);
|
||||
}
|
||||
|
||||
// Compute teta
|
||||
numtyp temp[9], tempv[3], tempv2[3];
|
||||
compute_eta_torque(g12,a1,ishape,temp);
|
||||
numtyp temp1 = -eta*gum[1];
|
||||
|
||||
tempv[0] = temp1*temp[0];
|
||||
tempv[1] = temp1*temp[1];
|
||||
tempv[2] = temp1*temp[2];
|
||||
gpu_cross3(a1,tempv,tempv2);
|
||||
teta[0] = tempv2[0];
|
||||
teta[1] = tempv2[1];
|
||||
teta[2] = tempv2[2];
|
||||
|
||||
tempv[0] = temp1*temp[3];
|
||||
tempv[1] = temp1*temp[4];
|
||||
tempv[2] = temp1*temp[5];
|
||||
gpu_cross3(a1+3,tempv,tempv2);
|
||||
teta[0] += tempv2[0];
|
||||
teta[1] += tempv2[1];
|
||||
teta[2] += tempv2[2];
|
||||
|
||||
tempv[0] = temp1*temp[6];
|
||||
tempv[1] = temp1*temp[7];
|
||||
tempv[2] = temp1*temp[8];
|
||||
gpu_cross3(a1+6,tempv,tempv2);
|
||||
teta[0] += tempv2[0];
|
||||
teta[1] += tempv2[1];
|
||||
teta[2] += tempv2[2];
|
||||
}
|
||||
|
||||
numtyp chi, dchi[3], tchi[3];
|
||||
{ // Compute chi and dchi
|
||||
|
||||
// Compute b12
|
||||
numtyp b2[9], b12[9];
|
||||
{
|
||||
gpu_diag_times3(well[jtype],a2,b12);
|
||||
gpu_transpose_times3(a2,b12,b2);
|
||||
gpu_plus3(b1,b2,b12);
|
||||
}
|
||||
|
||||
// compute chi_12
|
||||
r12[0]*=r;
|
||||
r12[1]*=r;
|
||||
r12[2]*=r;
|
||||
numtyp iota[3];
|
||||
gpu_mldivide3(b12,r12,iota,err_flag);
|
||||
// -- iota is now iota/r
|
||||
iota[0]*=ir;
|
||||
iota[1]*=ir;
|
||||
iota[2]*=ir;
|
||||
r12[0]*=ir;
|
||||
r12[1]*=ir;
|
||||
r12[2]*=ir;
|
||||
chi = gpu_dot3(r12,iota);
|
||||
chi = pow(chi*(numtyp)2.0,gum[2]);
|
||||
|
||||
// -- iota is now ok
|
||||
iota[0]*=r;
|
||||
iota[1]*=r;
|
||||
iota[2]*=r;
|
||||
|
||||
numtyp temp1 = gpu_dot3(iota,r12);
|
||||
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
|
||||
gum[2]);
|
||||
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
|
||||
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
|
||||
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
|
||||
|
||||
// compute t_chi
|
||||
numtyp tempv[3];
|
||||
gpu_row_times3(iota,b1,tempv);
|
||||
gpu_cross3(tempv,iota,tchi);
|
||||
temp1 = (numtyp)-4.0*ir*ir;
|
||||
tchi[0] *= temp1;
|
||||
tchi[1] *= temp1;
|
||||
tchi[2] *= temp1;
|
||||
}
|
||||
|
||||
numtyp temp2 = factor_lj*eta*chi;
|
||||
if (eflag>0)
|
||||
energy+=u_r*temp2;
|
||||
numtyp temp1 = -eta*u_r*factor_lj;
|
||||
if (vflag>0) {
|
||||
r12[0]*=-r;
|
||||
r12[1]*=-r;
|
||||
r12[2]*=-r;
|
||||
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
|
||||
f.x+=ft;
|
||||
virial[0]+=r12[0]*ft;
|
||||
ft=temp1*dchi[1]-temp2*dUr[1];
|
||||
f.y+=ft;
|
||||
virial[1]+=r12[1]*ft;
|
||||
virial[3]+=r12[0]*ft;
|
||||
ft=temp1*dchi[2]-temp2*dUr[2];
|
||||
f.z+=ft;
|
||||
virial[2]+=r12[2]*ft;
|
||||
virial[4]+=r12[0]*ft;
|
||||
virial[5]+=r12[1]*ft;
|
||||
} else {
|
||||
f.x+=temp1*dchi[0]-temp2*dUr[0];
|
||||
f.y+=temp1*dchi[1]-temp2*dUr[1];
|
||||
f.z+=temp1*dchi[2]-temp2*dUr[2];
|
||||
}
|
||||
|
||||
// Torque on 1
|
||||
temp1 = -u_r*eta*factor_lj;
|
||||
temp2 = -u_r*chi*factor_lj;
|
||||
numtyp temp3 = -chi*eta*factor_lj;
|
||||
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
|
||||
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
|
||||
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[7][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=tor.x;
|
||||
red_acc[4][tid]=tor.y;
|
||||
red_acc[5][tid]=tor.z;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
tor.x=red_acc[3][tid];
|
||||
tor.y=red_acc[4][tid];
|
||||
tor.z=red_acc[5][tid];
|
||||
|
||||
if (eflag>0 || vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
red_acc[6][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<7; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
energy=red_acc[6][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=astride;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=astride;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
ans[ii+astride]=tor;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,594 @@
|
|||
// **************************************************************************
|
||||
// gayberne_lj.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for Gay-Berne - Lennard-Jones potential acceleration
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef GAYBERNE_LJ_CU
|
||||
#define GAYBERNE_LJ_CU
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape,__global numtyp4* well,
|
||||
__global numtyp *gum, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global numtyp *lshape,
|
||||
__global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag,const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=gum[3];
|
||||
sp_lj[1]=gum[4];
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *nbor_end=nbor+stride*numj;
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp oner=shape[itype].x;
|
||||
numtyp one_well=well[itype].x;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r12[3];
|
||||
r12[0] = jx.x-ix.x;
|
||||
r12[1] = jx.y-ix.y;
|
||||
r12[2] = jx.z-ix.z;
|
||||
numtyp ir = gpu_dot3(r12,r12);
|
||||
|
||||
ir = rsqrt(ir);
|
||||
numtyp r = (numtyp)1.0/ir;
|
||||
|
||||
numtyp r12hat[3];
|
||||
r12hat[0]=r12[0]*ir;
|
||||
r12hat[1]=r12[1]*ir;
|
||||
r12hat[2]=r12[2]*ir;
|
||||
|
||||
numtyp a2[9];
|
||||
gpu_quat_to_mat_trans(q,j,a2);
|
||||
|
||||
numtyp u_r, dUr[3], eta;
|
||||
{ // Compute U_r, dUr, eta, and teta
|
||||
// Compute g12
|
||||
numtyp g12[9];
|
||||
{
|
||||
{
|
||||
numtyp g2[9];
|
||||
gpu_diag_times3(shape[jtype],a2,g12);
|
||||
gpu_transpose_times3(a2,g12,g2);
|
||||
g12[0]=g2[0]+oner;
|
||||
g12[4]=g2[4]+oner;
|
||||
g12[8]=g2[8]+oner;
|
||||
g12[1]=g2[1];
|
||||
g12[2]=g2[2];
|
||||
g12[3]=g2[3];
|
||||
g12[5]=g2[5];
|
||||
g12[6]=g2[6];
|
||||
g12[7]=g2[7];
|
||||
}
|
||||
|
||||
{ // Compute U_r and dUr
|
||||
|
||||
// Compute kappa
|
||||
numtyp kappa[3];
|
||||
gpu_mldivide3(g12,r12,kappa,err_flag);
|
||||
|
||||
// -- kappa is now / r
|
||||
kappa[0]*=ir;
|
||||
kappa[1]*=ir;
|
||||
kappa[2]*=ir;
|
||||
|
||||
// energy
|
||||
|
||||
// compute u_r and dUr
|
||||
numtyp uslj_rsq;
|
||||
{
|
||||
// Compute distance of closest approach
|
||||
numtyp h12, sigma12;
|
||||
sigma12 = gpu_dot3(r12hat,kappa);
|
||||
sigma12 = rsqrt((numtyp)0.5*sigma12);
|
||||
h12 = r-sigma12;
|
||||
|
||||
// -- kappa is now ok
|
||||
kappa[0]*=r;
|
||||
kappa[1]*=r;
|
||||
kappa[2]*=r;
|
||||
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
numtyp sigma = sig_eps[mtype].x;
|
||||
numtyp epsilon = sig_eps[mtype].y;
|
||||
numtyp varrho = sigma/(h12+gum[0]*sigma);
|
||||
numtyp varrho6 = varrho*varrho*varrho;
|
||||
varrho6*=varrho6;
|
||||
numtyp varrho12 = varrho6*varrho6;
|
||||
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
|
||||
|
||||
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
|
||||
temp1 = temp1*(numtyp)24.0*epsilon;
|
||||
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
|
||||
numtyp temp2 = gpu_dot3(kappa,r12hat);
|
||||
uslj_rsq = uslj_rsq*ir*ir;
|
||||
|
||||
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
|
||||
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
|
||||
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute eta
|
||||
{
|
||||
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
|
||||
numtyp det_g12 = gpu_det3(g12);
|
||||
eta = pow(eta/det_g12,gum[1]);
|
||||
}
|
||||
}
|
||||
|
||||
numtyp chi, dchi[3];
|
||||
{ // Compute chi and dchi
|
||||
|
||||
// Compute b12
|
||||
numtyp b12[9];
|
||||
{
|
||||
numtyp b2[9];
|
||||
gpu_diag_times3(well[jtype],a2,b12);
|
||||
gpu_transpose_times3(a2,b12,b2);
|
||||
b12[0]=b2[0]+one_well;
|
||||
b12[4]=b2[4]+one_well;
|
||||
b12[8]=b2[8]+one_well;
|
||||
b12[1]=b2[1];
|
||||
b12[2]=b2[2];
|
||||
b12[3]=b2[3];
|
||||
b12[5]=b2[5];
|
||||
b12[6]=b2[6];
|
||||
b12[7]=b2[7];
|
||||
}
|
||||
|
||||
// compute chi_12
|
||||
numtyp iota[3];
|
||||
gpu_mldivide3(b12,r12,iota,err_flag);
|
||||
// -- iota is now iota/r
|
||||
iota[0]*=ir;
|
||||
iota[1]*=ir;
|
||||
iota[2]*=ir;
|
||||
chi = gpu_dot3(r12hat,iota);
|
||||
chi = pow(chi*(numtyp)2.0,gum[2]);
|
||||
|
||||
// -- iota is now ok
|
||||
iota[0]*=r;
|
||||
iota[1]*=r;
|
||||
iota[2]*=r;
|
||||
|
||||
numtyp temp1 = gpu_dot3(iota,r12hat);
|
||||
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
|
||||
gum[2]);
|
||||
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
|
||||
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
|
||||
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
|
||||
}
|
||||
|
||||
numtyp temp2 = factor_lj*eta*chi;
|
||||
if (eflag>0)
|
||||
energy+=u_r*temp2;
|
||||
numtyp temp1 = -eta*u_r*factor_lj;
|
||||
if (vflag>0) {
|
||||
r12[0]*=-1;
|
||||
r12[1]*=-1;
|
||||
r12[2]*=-1;
|
||||
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
|
||||
f.x+=ft;
|
||||
virial[0]+=r12[0]*ft;
|
||||
ft=temp1*dchi[1]-temp2*dUr[1];
|
||||
f.y+=ft;
|
||||
virial[1]+=r12[1]*ft;
|
||||
virial[3]+=r12[0]*ft;
|
||||
ft=temp1*dchi[2]-temp2*dUr[2];
|
||||
f.z+=ft;
|
||||
virial[2]+=r12[2]*ft;
|
||||
virial[4]+=r12[0]*ft;
|
||||
virial[5]+=r12[1]*ft;
|
||||
} else {
|
||||
f.x+=temp1*dchi[0]-temp2*dUr[0];
|
||||
f.y+=temp1*dchi[1]-temp2*dUr[1];
|
||||
f.z+=temp1*dchi[2]-temp2*dUr[2];
|
||||
}
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<4; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=gum[3];
|
||||
sp_lj[1]=gum[4];
|
||||
sp_lj[2]=gum[5];
|
||||
sp_lj[3]=gum[6];
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *list_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int ii=itype*lj_types+jtype;
|
||||
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
|
||||
force*=factor_lj;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
|
||||
energy+=factor_lj*(e-lj3[ii].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<4; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in, __global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
if (tid<4)
|
||||
sp_lj[tid]=gum[tid+3];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
lj1[tid]=lj1_in[tid];
|
||||
if (eflag>0)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *list_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<4; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,472 @@
|
|||
// **************************************************************************
|
||||
// lj_class2_long.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for COMPASS LJ long acceleration
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin : Mon May 16 2011
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef LJCL_GPU_KERNEL
|
||||
#define LJCL_GPU_KERNEL
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
__inline double4 fetch_pos(const int& i, const double4 *pos)
|
||||
{
|
||||
return pos[i];
|
||||
}
|
||||
__inline double fetch_q(const int& i, const double *q)
|
||||
{
|
||||
return q[i];
|
||||
}
|
||||
#else
|
||||
__inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{
|
||||
return tex1Dfetch(pos_tex, i);
|
||||
}
|
||||
__inline float fetch_q(const int& i, const float *q)
|
||||
{
|
||||
return tex1Dfetch(q_tex, i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define fetch_q(i,y) q_[i]
|
||||
#define BLOCK_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp2 double2
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifdef _SINGLE_DOUBLE
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
#define EWALD_F (numtyp)1.12837917
|
||||
#define EWALD_P (numtyp)0.3275911
|
||||
#define A1 (numtyp)0.254829592
|
||||
#define A2 (numtyp)-0.284496736
|
||||
#define A3 (numtyp)1.421413741
|
||||
#define A4 (numtyp)-1.453152027
|
||||
#define A5 (numtyp)1.061405429
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
__global acctyp *engv, const int eflag,
|
||||
const int vflag, const int inum,
|
||||
const int nbor_pitch, __global numtyp *q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[8];
|
||||
sp_lj[0]=sp_lj_in[0];
|
||||
sp_lj[1]=sp_lj_in[1];
|
||||
sp_lj[2]=sp_lj_in[2];
|
||||
sp_lj[3]=sp_lj_in[3];
|
||||
sp_lj[4]=sp_lj_in[4];
|
||||
sp_lj[5]=sp_lj_in[5];
|
||||
sp_lj[6]=sp_lj_in[6];
|
||||
sp_lj[7]=sp_lj_in[7];
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
int n_stride;
|
||||
__global int *list_end;
|
||||
if (dev_nbor==dev_packed) {
|
||||
list_end=nbor+mul24(numj,nbor_pitch);
|
||||
nbor+=mul24(offset,nbor_pitch);
|
||||
n_stride=mul24(t_per_atom,nbor_pitch);
|
||||
} else {
|
||||
nbor=dev_packed+*nbor;
|
||||
list_end=nbor+numj;
|
||||
n_stride=t_per_atom;
|
||||
nbor+=offset;
|
||||
}
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<lj1[mtype].z) {
|
||||
numtyp r2inv=(numtyp)1.0/rsq;
|
||||
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
|
||||
|
||||
if (rsq < lj1[mtype].w) {
|
||||
numtyp rinv=sqrt(r2inv);
|
||||
r3inv=r2inv*rinv;
|
||||
r6inv = r3inv*r3inv;
|
||||
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
if (rsq < cut_coulsq) {
|
||||
numtyp r = sqrt(rsq);
|
||||
numtyp grij = g_ewald * r;
|
||||
numtyp expm2 = exp(-grij*grij);
|
||||
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else {
|
||||
forcecoul = (numtyp)0.0;
|
||||
prefactor = (numtyp)0.0;
|
||||
}
|
||||
|
||||
force = (force_lj + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
if (rsq < lj1[mtype].w) {
|
||||
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
red_acc[4][tid]=e_coul;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<5; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
e_coul=red_acc[4][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
*ap1=e_coul;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int nbor_pitch, __global numtyp *q_,
|
||||
const numtyp cut_coulsq, const numtyp qqrd2e,
|
||||
const numtyp g_ewald, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp sp_lj[8];
|
||||
if (tid<8)
|
||||
sp_lj[tid]=sp_lj_in[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
lj1[tid]=lj1_in[tid];
|
||||
if (eflag>0)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp e_coul=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
int numj=*nbor;
|
||||
nbor+=nbor_pitch;
|
||||
|
||||
int n_stride;
|
||||
__global int *list_end;
|
||||
if (dev_nbor==dev_packed) {
|
||||
list_end=nbor+mul24(numj,nbor_pitch);
|
||||
nbor+=mul24(offset,nbor_pitch);
|
||||
n_stride=mul24(t_per_atom,nbor_pitch);
|
||||
} else {
|
||||
nbor=dev_packed+*nbor;
|
||||
list_end=nbor+numj;
|
||||
n_stride=t_per_atom;
|
||||
nbor+=offset;
|
||||
}
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
|
||||
numtyp factor_lj, factor_coul;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<lj1[mtype].z) {
|
||||
numtyp r2inv=(numtyp)1.0/rsq;
|
||||
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
|
||||
|
||||
if (rsq < lj1[mtype].w) {
|
||||
numtyp rinv=sqrt(r2inv);
|
||||
r3inv=r2inv*rinv;
|
||||
r6inv = r3inv*r3inv;
|
||||
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
if (rsq < cut_coulsq) {
|
||||
numtyp r = sqrt(rsq);
|
||||
numtyp grij = g_ewald * r;
|
||||
numtyp expm2 = exp(-grij*grij);
|
||||
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else {
|
||||
forcecoul = (numtyp)0.0;
|
||||
prefactor = (numtyp)0.0;
|
||||
}
|
||||
|
||||
force = (force_lj + forcecoul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
e_coul += prefactor*(_erfc-factor_coul);
|
||||
if (rsq < lj1[mtype].w) {
|
||||
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
red_acc[4][tid]=e_coul;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<5; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
e_coul=red_acc[4][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
*ap1=e_coul;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii*/
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,526 @@
|
|||
// **************************************************************************
|
||||
// re_squared.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for RE-Squared potential acceleration
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin : Fri May 06 2011
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef RE_SQUARED_CU
|
||||
#define RE_SQUARED_CU
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
__inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
|
||||
{
|
||||
numtyp ans;
|
||||
ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] -
|
||||
m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] +
|
||||
m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] +
|
||||
m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] -
|
||||
m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] +
|
||||
m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] +
|
||||
m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] -
|
||||
m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] +
|
||||
m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4];
|
||||
return ans;
|
||||
}
|
||||
|
||||
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor,
|
||||
const int stride, __global acctyp4 *ans,
|
||||
const int astride, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=splj[0];
|
||||
sp_lj[1]=splj[1];
|
||||
sp_lj[2]=splj[2];
|
||||
sp_lj[3]=splj[3];
|
||||
|
||||
__local numtyp b_alpha, cr60;
|
||||
b_alpha=(numtyp)45.0/(numtyp)56.0;
|
||||
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp4 tor;
|
||||
tor.x=(acctyp)0;
|
||||
tor.y=(acctyp)0;
|
||||
tor.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *nbor_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a1[9]; // Rotation matrix (lab->body)
|
||||
numtyp aTe1[9]; // A'*E
|
||||
numtyp gamma1[9]; // A'*S^2*A
|
||||
numtyp sa1[9]; // S^2*A;
|
||||
numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z)
|
||||
numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA
|
||||
numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa
|
||||
numtyp4 ishape;
|
||||
|
||||
ishape=shape[itype];
|
||||
numtyp4 ishape2;
|
||||
ishape2.x=ishape.x*ishape.x;
|
||||
ishape2.y=ishape.y*ishape.y;
|
||||
ishape2.z=ishape.z*ishape.z;
|
||||
numtyp ilshape = ishape.x*ishape.y*ishape.z;
|
||||
|
||||
{
|
||||
numtyp aTs[9]; // A1'*S1^2
|
||||
gpu_quat_to_mat_trans(q,i,a1);
|
||||
gpu_transpose_times_diag3(a1,well[itype],aTe1);
|
||||
gpu_transpose_times_diag3(a1,ishape2,aTs);
|
||||
gpu_diag_times3(ishape2,a1,sa1);
|
||||
gpu_times3(aTs,a1,gamma1);
|
||||
gpu_rotation_generator_x(a1,lA1_0);
|
||||
gpu_rotation_generator_y(a1,lA1_1);
|
||||
gpu_rotation_generator_z(a1,lA1_2);
|
||||
gpu_times3(aTs,lA1_0,lAtwo1_0);
|
||||
gpu_transpose_times3(lA1_0,sa1,lAsa1_0);
|
||||
gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0);
|
||||
gpu_times3(aTs,lA1_1,lAtwo1_1);
|
||||
gpu_transpose_times3(lA1_1,sa1,lAsa1_1);
|
||||
gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1);
|
||||
gpu_times3(aTs,lA1_2,lAtwo1_2);
|
||||
gpu_transpose_times3(lA1_2,sa1,lAsa1_2);
|
||||
gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2);
|
||||
}
|
||||
ishape2.x=(numtyp)1.0/ishape2.x;
|
||||
ishape2.y=(numtyp)1.0/ishape2.y;
|
||||
ishape2.z=(numtyp)1.0/ishape2.z;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r[3], rhat[3];
|
||||
numtyp rnorm;
|
||||
r[0] = jx.x-ix.x;
|
||||
r[1] = jx.y-ix.y;
|
||||
r[2] = jx.z-ix.z;
|
||||
rnorm = gpu_dot3(r,r);
|
||||
rnorm = rsqrt(rnorm);
|
||||
rhat[0] = r[0]*rnorm;
|
||||
rhat[1] = r[1]*rnorm;
|
||||
rhat[2] = r[2]*rnorm;
|
||||
|
||||
|
||||
numtyp a2[9]; // Rotation matrix (lab->body)
|
||||
numtyp gamma2[9]; // A'*S^2*A
|
||||
numtyp4 jshape;
|
||||
|
||||
jshape=shape[jtype];
|
||||
numtyp4 jshape2;
|
||||
jshape2.x=jshape.x*jshape.x;
|
||||
jshape2.y=jshape.y*jshape.y;
|
||||
jshape2.z=jshape.z*jshape.z;
|
||||
{
|
||||
numtyp aTs[9]; // A1'*S1^2
|
||||
gpu_quat_to_mat_trans(q,j,a2);
|
||||
gpu_transpose_times_diag3(a2,jshape2,aTs);
|
||||
gpu_times3(aTs,a2,gamma2);
|
||||
}
|
||||
|
||||
numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3];
|
||||
numtyp sigma12, sigma1, sigma2;
|
||||
gpu_plus3(gamma1,gamma2,temp);
|
||||
gpu_mldivide3(temp,rhat,s,err_flag);
|
||||
sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
|
||||
gpu_times_column3(a1,rhat,z1);
|
||||
gpu_times_column3(a2,rhat,z2);
|
||||
v1[0] = z1[0]*ishape2.x;
|
||||
v1[1] = z1[1]*ishape2.y;
|
||||
v1[2] = z1[2]*ishape2.z;
|
||||
v2[0] = z2[0]/jshape2.x;
|
||||
v2[1] = z2[1]/jshape2.y;
|
||||
v2[2] = z2[2]/jshape2.z;
|
||||
sigma1 = sqrt(gpu_dot3(z1,v1));
|
||||
sigma2 = sqrt(gpu_dot3(z2,v2));
|
||||
|
||||
numtyp H12[9];
|
||||
numtyp dH;
|
||||
H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2;
|
||||
H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2;
|
||||
H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2;
|
||||
H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2;
|
||||
H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2;
|
||||
H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2;
|
||||
H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2;
|
||||
H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2;
|
||||
H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2;
|
||||
dH=gpu_det3(H12);
|
||||
|
||||
numtyp sigma1p2, sigma2p2, lambda, nu;
|
||||
sigma1p2 = sigma1*sigma1;
|
||||
sigma2p2 = sigma2*sigma2;
|
||||
numtyp jlshape = jshape.x*jshape.y*jshape.z;
|
||||
lambda = ilshape*sigma1p2 + jlshape*sigma2p2;
|
||||
|
||||
|
||||
sigma1=(numtyp)1.0/sigma1;
|
||||
sigma2=(numtyp)1.0/sigma2;
|
||||
|
||||
nu = sqrt(dH/(sigma1+sigma2));
|
||||
gpu_times3(aTe1,a1,temp);
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
numtyp w[3], temp2[9];
|
||||
numtyp h12,eta,chi,sprod,sigh,tprod;
|
||||
numtyp aTe2[9]; // A'*E
|
||||
gpu_transpose_times_diag3(a2,well[jtype],aTe2);
|
||||
gpu_times3(aTe2,a2,temp2);
|
||||
gpu_plus3(temp,temp2,temp);
|
||||
gpu_mldivide3(temp,rhat,w,err_flag);
|
||||
h12 = (numtyp)1.0/rnorm-sigma12;
|
||||
eta = lambda/nu;
|
||||
chi = (numtyp)2.0*gpu_dot3(rhat,w);
|
||||
sprod = ilshape * jlshape;
|
||||
sigh = sigma/h12;
|
||||
tprod = eta*chi*sigh;
|
||||
|
||||
numtyp stemp, Ua;
|
||||
stemp = h12*(numtyp)0.5;
|
||||
Ua = (ishape.x+stemp)*(ishape.y+stemp)*
|
||||
(ishape.z+stemp)*(jshape.x+stemp)*
|
||||
(jshape.y+stemp)*(jshape.z+stemp);
|
||||
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua;
|
||||
Ua = epsilon*Ua/(numtyp)-36.0;
|
||||
|
||||
numtyp Ur;
|
||||
stemp = h12/cr60;
|
||||
Ur = (ishape.x+stemp)*(ishape.y+stemp)*
|
||||
(ishape.z+stemp)*(jshape.x+stemp)*
|
||||
(jshape.y+stemp)*(jshape.z+stemp);
|
||||
Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur;
|
||||
numtyp sigh6=sigh*sigh*sigh;
|
||||
sigh6*=sigh6;
|
||||
Ur = epsilon*Ur*sigh6/(numtyp)2025.0;
|
||||
|
||||
energy+=Ua+Ur;
|
||||
|
||||
// force
|
||||
|
||||
numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9];
|
||||
numtyp sec, sigma12p3, sigma1p3, sigma2p3;
|
||||
sec = sigma*eta*chi;
|
||||
sigma12p3 = sigma12*sigma12*sigma12;
|
||||
sigma1p3 = sigma1/sigma1p2;
|
||||
sigma2p3 = sigma2/sigma2p2;
|
||||
vsigma1[0] = -sigma1p3*v1[0];
|
||||
vsigma1[1] = -sigma1p3*v1[1];
|
||||
vsigma1[2] = -sigma1p3*v1[2];
|
||||
vsigma2[0] = -sigma2p3*v2[0];
|
||||
vsigma2[1] = -sigma2p3*v2[1];
|
||||
vsigma2[2] = -sigma2p3*v2[2];
|
||||
gsigma1[0] = -gamma1[0]*sigma1p2;
|
||||
gsigma1[1] = -gamma1[1]*sigma1p2;
|
||||
gsigma1[2] = -gamma1[2]*sigma1p2;
|
||||
gsigma1[3] = -gamma1[3]*sigma1p2;
|
||||
gsigma1[4] = -gamma1[4]*sigma1p2;
|
||||
gsigma1[5] = -gamma1[5]*sigma1p2;
|
||||
gsigma1[6] = -gamma1[6]*sigma1p2;
|
||||
gsigma1[7] = -gamma1[7]*sigma1p2;
|
||||
gsigma1[8] = -gamma1[8]*sigma1p2;
|
||||
gsigma2[0] = -gamma2[0]*sigma2p2;
|
||||
gsigma2[1] = -gamma2[1]*sigma2p2;
|
||||
gsigma2[2] = -gamma2[2]*sigma2p2;
|
||||
gsigma2[3] = -gamma2[3]*sigma2p2;
|
||||
gsigma2[4] = -gamma2[4]*sigma2p2;
|
||||
gsigma2[5] = -gamma2[5]*sigma2p2;
|
||||
gsigma2[6] = -gamma2[6]*sigma2p2;
|
||||
gsigma2[7] = -gamma2[7]*sigma2p2;
|
||||
gsigma2[8] = -gamma2[8]*sigma2p2;
|
||||
|
||||
numtyp tsig1sig2, tdH, teta1, teta2;
|
||||
numtyp fourw[3], spr[3];
|
||||
tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2));
|
||||
tdH = eta/((numtyp)2.0*dH);
|
||||
teta1 = (numtyp)2.0*eta/lambda;
|
||||
teta2 = teta1*jlshape/sigma2p3;
|
||||
teta1 = teta1*ilshape/sigma1p3;
|
||||
fourw[0] = (numtyp)4.0*w[0];
|
||||
fourw[1] = (numtyp)4.0*w[1];
|
||||
fourw[2] = (numtyp)4.0*w[2];
|
||||
spr[0] = (numtyp)0.5*sigma12p3*s[0];
|
||||
spr[1] = (numtyp)0.5*sigma12p3*s[1];
|
||||
spr[2] = (numtyp)0.5*sigma12p3*s[2];
|
||||
|
||||
numtyp hsec, dspu, pbsu;
|
||||
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(jshape.x*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(jshape.y*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(jshape.z*(numtyp)2.0+h12);
|
||||
hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
|
||||
dspu = (numtyp)1.0/h12-hsec+stemp;
|
||||
pbsu = (numtyp)3.0*sigma*hsec;
|
||||
|
||||
numtyp dspr, pbsr;
|
||||
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.y*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.z*cr60+h12)+
|
||||
(numtyp)1.0/(jshape.x*cr60+h12)+
|
||||
(numtyp)1.0/(jshape.y*cr60+h12)+
|
||||
(numtyp)1.0/(jshape.z*cr60+h12);
|
||||
hsec = (numtyp)1.0/(h12+b_alpha*sec);
|
||||
dspr = (numtyp)7.0/h12-hsec+stemp;
|
||||
pbsr = b_alpha*sigma*hsec;
|
||||
|
||||
numtyp dH12[9];
|
||||
numtyp dUa, dUr, deta, dchi, ddH, dh12;
|
||||
numtyp dsigma1, dsigma2;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<3; i++) {
|
||||
numtyp u[3], u1[3], u2[3];
|
||||
u[0] = -rhat[i]*rhat[0];
|
||||
u[1] = -rhat[i]*rhat[1];
|
||||
u[2] = -rhat[i]*rhat[2];
|
||||
u[i] += (numtyp)1.0;
|
||||
u[0] *= rnorm;
|
||||
u[1] *= rnorm;
|
||||
u[2] *= rnorm;
|
||||
gpu_times_column3(a1,u,u1);
|
||||
gpu_times_column3(a2,u,u2);
|
||||
dsigma1=gpu_dot3(u1,vsigma1);
|
||||
dsigma2=gpu_dot3(u2,vsigma2);
|
||||
dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0];
|
||||
dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1];
|
||||
dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2];
|
||||
dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3];
|
||||
dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4];
|
||||
dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5];
|
||||
dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6];
|
||||
dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7];
|
||||
dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8];
|
||||
ddH = det_prime(H12,dH12);
|
||||
deta = (dsigma1+dsigma2)*tsig1sig2;
|
||||
deta -= ddH*tdH;
|
||||
deta -= dsigma1*teta1+dsigma2*teta2;
|
||||
dchi = gpu_dot3(u,fourw);
|
||||
dh12 = rhat[i]+gpu_dot3(u,spr);
|
||||
dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu;
|
||||
dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr;
|
||||
numtyp force=dUr*Ur+dUa*Ua;
|
||||
if (i==0) {
|
||||
f.x+=force;
|
||||
if (vflag>0)
|
||||
virial[0]+=-r[0]*force;
|
||||
} else if (i==1) {
|
||||
f.y+=force;
|
||||
if (vflag>0) {
|
||||
virial[1]+=-r[1]*force;
|
||||
virial[3]+=-r[0]*force;
|
||||
}
|
||||
} else {
|
||||
f.z+=force;
|
||||
if (vflag>0) {
|
||||
virial[2]+=-r[2]*force;
|
||||
virial[4]+=-r[0]*force;
|
||||
virial[5]+=-r[1]*force;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// torque on i
|
||||
sigma1=(numtyp)1.0/sigma1;
|
||||
|
||||
numtyp fwae[3], p[3];
|
||||
gpu_row_times3(fourw,aTe1,fwae);
|
||||
|
||||
{
|
||||
gpu_times_column3(lA1_0,rhat,p);
|
||||
dsigma1 = gpu_dot3(p,vsigma1);
|
||||
dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0];
|
||||
dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1];
|
||||
dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2];
|
||||
dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3];
|
||||
dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4];
|
||||
dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5];
|
||||
dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6];
|
||||
dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7];
|
||||
dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8];
|
||||
ddH = det_prime(H12,dH12);
|
||||
deta = tsig1sig2*dsigma1-tdH*ddH;
|
||||
deta -= teta1*dsigma1;
|
||||
numtyp tempv[3];
|
||||
gpu_times_column3(lA1_0,w,tempv);
|
||||
dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times_column3(lAtwo1_0,spr,tempv);
|
||||
dh12 = -gpu_dot3(s,tempv);
|
||||
|
||||
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
|
||||
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
|
||||
tor.x -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
{
|
||||
gpu_times_column3(lA1_1,rhat,p);
|
||||
dsigma1 = gpu_dot3(p,vsigma1);
|
||||
dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0];
|
||||
dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1];
|
||||
dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2];
|
||||
dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3];
|
||||
dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4];
|
||||
dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5];
|
||||
dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6];
|
||||
dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7];
|
||||
dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8];
|
||||
ddH = det_prime(H12,dH12);
|
||||
deta = tsig1sig2*dsigma1-tdH*ddH;
|
||||
deta -= teta1*dsigma1;
|
||||
numtyp tempv[3];
|
||||
gpu_times_column3(lA1_1,w,tempv);
|
||||
dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times_column3(lAtwo1_1,spr,tempv);
|
||||
dh12 = -gpu_dot3(s,tempv);
|
||||
|
||||
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
|
||||
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
|
||||
tor.y -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
{
|
||||
gpu_times_column3(lA1_2,rhat,p);
|
||||
dsigma1 = gpu_dot3(p,vsigma1);
|
||||
dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0];
|
||||
dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1];
|
||||
dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2];
|
||||
dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3];
|
||||
dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4];
|
||||
dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5];
|
||||
dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6];
|
||||
dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7];
|
||||
dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8];
|
||||
ddH = det_prime(H12,dH12);
|
||||
deta = tsig1sig2*dsigma1-tdH*ddH;
|
||||
deta -= teta1*dsigma1;
|
||||
numtyp tempv[3];
|
||||
gpu_times_column3(lA1_2,w,tempv);
|
||||
dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times_column3(lAtwo1_2,spr,tempv);
|
||||
dh12 = -gpu_dot3(s,tempv);
|
||||
|
||||
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
|
||||
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
|
||||
tor.z -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[7][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=tor.x;
|
||||
red_acc[4][tid]=tor.y;
|
||||
red_acc[5][tid]=tor.z;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
tor.x=red_acc[3][tid];
|
||||
tor.y=red_acc[4][tid];
|
||||
tor.z=red_acc[5][tid];
|
||||
|
||||
if (eflag>0 || vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
red_acc[6][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<7; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
energy=red_acc[6][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=astride;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=astride;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
ans[ii+astride]=tor;
|
||||
} // if ii
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,892 @@
|
|||
// **************************************************************************
|
||||
// re_squared_lj.cu
|
||||
// -------------------
|
||||
// W. Michael Brown
|
||||
//
|
||||
// Device code for RE-Squared - Lennard-Jones potential acceleration
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin : Fri May 06 2011
|
||||
// email : brownw@ornl.gov
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef RE_SQUARED_LJ_CU
|
||||
#define RE_SQUARED_LJ_CU
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
__inline int sbmask(int j) { return j >> SBBITS & 3; }
|
||||
|
||||
|
||||
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, const int astride,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=splj[0];
|
||||
sp_lj[1]=splj[1];
|
||||
sp_lj[2]=splj[2];
|
||||
sp_lj[3]=splj[3];
|
||||
|
||||
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
|
||||
b_alpha=(numtyp)45.0/(numtyp)56.0;
|
||||
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
|
||||
solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
|
||||
solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp4 tor;
|
||||
tor.x=(acctyp)0;
|
||||
tor.y=(acctyp)0;
|
||||
tor.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *nbor_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a[9]; // Rotation matrix (lab->body)
|
||||
numtyp aTe[9]; // A'*E
|
||||
numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z)
|
||||
|
||||
numtyp4 ishape;
|
||||
ishape=shape[itype];
|
||||
numtyp ilshape=ishape.x*ishape.y*ishape.z;
|
||||
|
||||
{
|
||||
gpu_quat_to_mat_trans(q,i,a);
|
||||
gpu_transpose_times_diag3(a,well[itype],aTe);
|
||||
gpu_rotation_generator_x(a,lA_0);
|
||||
gpu_rotation_generator_y(a,lA_1);
|
||||
gpu_rotation_generator_z(a,lA_2);
|
||||
}
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp r[3], rhat[3];
|
||||
numtyp rnorm;
|
||||
r[0] = jx.x-ix.x;
|
||||
r[1] = jx.y-ix.y;
|
||||
r[2] = jx.z-ix.z;
|
||||
rnorm = gpu_dot3(r,r);
|
||||
rnorm = rsqrt(rnorm);
|
||||
rhat[0] = r[0]*rnorm;
|
||||
rhat[1] = r[1]*rnorm;
|
||||
rhat[2] = r[2]*rnorm;
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
numtyp aTs[9];
|
||||
numtyp4 scorrect;
|
||||
numtyp half_sigma=sigma*(numtyp)0.5;
|
||||
scorrect.x = ishape.x+half_sigma;
|
||||
scorrect.y = ishape.y+half_sigma;
|
||||
scorrect.z = ishape.z+half_sigma;
|
||||
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
|
||||
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
|
||||
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
|
||||
gpu_transpose_times_diag3(a,scorrect,aTs);
|
||||
|
||||
// energy
|
||||
|
||||
numtyp gamma[9], s[3];
|
||||
gpu_times3(aTs,a,gamma);
|
||||
gpu_mldivide3(gamma,rhat,s,err_flag);
|
||||
|
||||
numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
|
||||
numtyp temp[9], w[3];
|
||||
gpu_times3(aTe,a,temp);
|
||||
temp[0] += (numtyp)1.0;
|
||||
temp[4] += (numtyp)1.0;
|
||||
temp[8] += (numtyp)1.0;
|
||||
gpu_mldivide3(temp,rhat,w,err_flag);
|
||||
|
||||
numtyp h12 = (numtyp)1.0/rnorm-sigma12;
|
||||
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
|
||||
numtyp sigh = sigma/h12;
|
||||
numtyp tprod = chi*sigh;
|
||||
|
||||
numtyp Ua, Ur;
|
||||
numtyp h12p3 = h12*h12*h12;
|
||||
numtyp sigmap3 = sigma*sigma*sigma;
|
||||
numtyp stemp = h12*(numtyp)0.5;
|
||||
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
|
||||
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
|
||||
Ua = epsilon*Ua*sigmap3*solv_f_a;
|
||||
|
||||
stemp = h12/cr60;
|
||||
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
|
||||
(numtyp)60.0;
|
||||
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
|
||||
numtyp sigh6=sigh*sigh*sigh;
|
||||
sigh6*=sigh6;
|
||||
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
|
||||
|
||||
energy+=Ua+Ur;
|
||||
|
||||
// force
|
||||
|
||||
numtyp fourw[3], spr[3];
|
||||
numtyp sec = sigma*chi;
|
||||
numtyp sigma12p3 = sigma12*sigma12*sigma12;
|
||||
fourw[0] = (numtyp)4.0*w[0];
|
||||
fourw[1] = (numtyp)4.0*w[1];
|
||||
fourw[2] = (numtyp)4.0*w[2];
|
||||
spr[0] = (numtyp)0.5*sigma12p3*s[0];
|
||||
spr[1] = (numtyp)0.5*sigma12p3*s[1];
|
||||
spr[2] = (numtyp)0.5*sigma12p3*s[2];
|
||||
|
||||
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
|
||||
(numtyp)3.0/h12;
|
||||
numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
|
||||
numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
|
||||
numtyp pbsu = (numtyp)3.0*sigma*hsec;
|
||||
|
||||
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.y*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.z*cr60+h12)+
|
||||
(numtyp)3.0/h12;
|
||||
hsec = (numtyp)1.0/(h12+b_alpha*sec);
|
||||
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
|
||||
numtyp pbsr = b_alpha*sigma*hsec;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<3; i++) {
|
||||
numtyp u[3];
|
||||
u[0] = -rhat[i]*rhat[0];
|
||||
u[1] = -rhat[i]*rhat[1];
|
||||
u[2] = -rhat[i]*rhat[2];
|
||||
u[i] += (numtyp)1.0;
|
||||
u[0] *= rnorm;
|
||||
u[1] *= rnorm;
|
||||
u[2] *= rnorm;
|
||||
numtyp dchi = gpu_dot3(u,fourw);
|
||||
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
|
||||
numtyp dUa = pbsu*dchi-dh12*dspu;
|
||||
numtyp dUr = pbsr*dchi-dh12*dspr;
|
||||
numtyp force=dUr*Ur+dUa*Ua;
|
||||
if (i==0) {
|
||||
f.x+=force;
|
||||
if (vflag>0)
|
||||
virial[0]+=-r[0]*force;
|
||||
} else if (i==1) {
|
||||
f.y+=force;
|
||||
if (vflag>0) {
|
||||
virial[1]+=-r[1]*force;
|
||||
virial[3]+=-r[0]*force;
|
||||
}
|
||||
} else {
|
||||
f.z+=force;
|
||||
if (vflag>0) {
|
||||
virial[2]+=-r[2]*force;
|
||||
virial[4]+=-r[0]*force;
|
||||
virial[5]+=-r[1]*force;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// torque on i
|
||||
numtyp fwae[3];
|
||||
gpu_row_times3(fourw,aTe,fwae);
|
||||
{
|
||||
numtyp tempv[3], p[3], lAtwo[9];
|
||||
gpu_times_column3(lA_0,rhat,p);
|
||||
gpu_times_column3(lA_0,w,tempv);
|
||||
numtyp dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times3(aTs,lA_0,lAtwo);
|
||||
gpu_times_column3(lAtwo,spr,tempv);
|
||||
numtyp dh12 = -gpu_dot3(s,tempv);
|
||||
numtyp dUa = pbsu*dchi-dh12*dspu;
|
||||
numtyp dUr = pbsr*dchi-dh12*dspr;
|
||||
tor.x -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
{
|
||||
numtyp tempv[3], p[3], lAtwo[9];
|
||||
gpu_times_column3(lA_1,rhat,p);
|
||||
gpu_times_column3(lA_1,w,tempv);
|
||||
numtyp dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times3(aTs,lA_1,lAtwo);
|
||||
gpu_times_column3(lAtwo,spr,tempv);
|
||||
numtyp dh12 = -gpu_dot3(s,tempv);
|
||||
numtyp dUa = pbsu*dchi-dh12*dspu;
|
||||
numtyp dUr = pbsr*dchi-dh12*dspr;
|
||||
tor.y -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
{
|
||||
numtyp tempv[3], p[3], lAtwo[9];
|
||||
gpu_times_column3(lA_2,rhat,p);
|
||||
gpu_times_column3(lA_2,w,tempv);
|
||||
numtyp dchi = -gpu_dot3(fwae,tempv);
|
||||
gpu_times3(aTs,lA_2,lAtwo);
|
||||
gpu_times_column3(lAtwo,spr,tempv);
|
||||
numtyp dh12 = -gpu_dot3(s,tempv);
|
||||
numtyp dUa = pbsu*dchi-dh12*dspu;
|
||||
numtyp dUr = pbsr*dchi-dh12*dspr;
|
||||
tor.z -= (dUa*Ua+dUr*Ur);
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[7][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=tor.x;
|
||||
red_acc[4][tid]=tor.y;
|
||||
red_acc[5][tid]=tor.z;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
tor.x=red_acc[3][tid];
|
||||
tor.y=red_acc[4][tid];
|
||||
tor.z=red_acc[5][tid];
|
||||
|
||||
if (eflag>0 || vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
red_acc[6][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<7; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
energy=red_acc[6][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=astride;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=astride;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
|
||||
old=ans[ii+astride];
|
||||
old.x+=tor.x;
|
||||
old.y+=tor.y;
|
||||
old.z+=tor.z;
|
||||
ans[ii+astride]=old;
|
||||
} // if ii
|
||||
|
||||
}
|
||||
|
||||
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape,__global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor,
|
||||
const int stride, __global acctyp4 *ans,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag,const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=splj[0];
|
||||
sp_lj[1]=splj[1];
|
||||
sp_lj[2]=splj[2];
|
||||
sp_lj[3]=splj[3];
|
||||
|
||||
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
|
||||
b_alpha=(numtyp)45.0/(numtyp)56.0;
|
||||
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
|
||||
solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
|
||||
solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_nbor+ii;
|
||||
int j=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *nbor_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
int i=*nbor;
|
||||
factor_lj = sp_lj[sbmask(i)];
|
||||
i &= NEIGHMASK;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a[9]; // Rotation matrix (lab->body)
|
||||
numtyp aTe[9]; // A'*E
|
||||
numtyp4 ishape;
|
||||
|
||||
ishape=shape[itype];
|
||||
gpu_quat_to_mat_trans(q,i,a);
|
||||
gpu_transpose_times_diag3(a,well[itype],aTe);
|
||||
|
||||
// Compute r12
|
||||
numtyp r[3], rhat[3];
|
||||
numtyp rnorm;
|
||||
r[0] = ix.x-jx.x;
|
||||
r[1] = ix.y-jx.y;
|
||||
r[2] = ix.z-jx.z;
|
||||
rnorm = gpu_dot3(r,r);
|
||||
rnorm = rsqrt(rnorm);
|
||||
rhat[0] = r[0]*rnorm;
|
||||
rhat[1] = r[1]*rnorm;
|
||||
rhat[2] = r[2]*rnorm;
|
||||
|
||||
numtyp sigma, epsilon;
|
||||
int mtype=mul24(ntypes,itype)+jtype;
|
||||
sigma = sig_eps[mtype].x;
|
||||
epsilon = sig_eps[mtype].y*factor_lj;
|
||||
|
||||
numtyp aTs[9];
|
||||
numtyp4 scorrect;
|
||||
numtyp half_sigma=sigma * (numtyp)0.5;
|
||||
scorrect.x = ishape.x+half_sigma;
|
||||
scorrect.y = ishape.y+half_sigma;
|
||||
scorrect.z = ishape.z+half_sigma;
|
||||
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
|
||||
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
|
||||
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
|
||||
gpu_transpose_times_diag3(a,scorrect,aTs);
|
||||
|
||||
// energy
|
||||
|
||||
numtyp gamma[9], s[3];
|
||||
gpu_times3(aTs,a,gamma);
|
||||
gpu_mldivide3(gamma,rhat,s,err_flag);
|
||||
|
||||
numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
|
||||
numtyp temp[9], w[3];
|
||||
gpu_times3(aTe,a,temp);
|
||||
temp[0] += (numtyp)1.0;
|
||||
temp[4] += (numtyp)1.0;
|
||||
temp[8] += (numtyp)1.0;
|
||||
gpu_mldivide3(temp,rhat,w,err_flag);
|
||||
|
||||
numtyp h12 = (numtyp)1.0/rnorm-sigma12;
|
||||
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
|
||||
numtyp sigh = sigma/h12;
|
||||
numtyp tprod = chi*sigh;
|
||||
|
||||
numtyp Ua, Ur;
|
||||
numtyp h12p3 = h12*h12*h12;
|
||||
numtyp sigmap3 = sigma*sigma*sigma;
|
||||
numtyp stemp = h12/(numtyp)2.0;
|
||||
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
|
||||
numtyp ilshape=ishape.x*ishape.y*ishape.z;
|
||||
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
|
||||
Ua = epsilon*Ua*sigmap3*solv_f_a;
|
||||
|
||||
stemp = h12/cr60;
|
||||
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
|
||||
(numtyp)60.0;
|
||||
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
|
||||
numtyp sigh6=sigh*sigh*sigh;
|
||||
sigh6*=sigh6;
|
||||
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
|
||||
|
||||
energy+=Ua+Ur;
|
||||
|
||||
// force
|
||||
|
||||
numtyp fourw[3], spr[3];
|
||||
numtyp sec = sigma*chi;
|
||||
numtyp sigma12p3 = sigma12*sigma12*sigma12;
|
||||
fourw[0] = (numtyp)4.0*w[0];
|
||||
fourw[1] = (numtyp)4.0*w[1];
|
||||
fourw[2] = (numtyp)4.0*w[2];
|
||||
spr[0] = (numtyp)0.5*sigma12p3*s[0];
|
||||
spr[1] = (numtyp)0.5*sigma12p3*s[1];
|
||||
spr[2] = (numtyp)0.5*sigma12p3*s[2];
|
||||
|
||||
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
|
||||
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
|
||||
(numtyp)3.0/h12;
|
||||
numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
|
||||
numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
|
||||
numtyp pbsu = (numtyp)3.0*sigma*hsec;
|
||||
|
||||
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.y*cr60+h12)+
|
||||
(numtyp)1.0/(ishape.z*cr60+h12)+
|
||||
(numtyp)3.0/h12;
|
||||
hsec = (numtyp)1.0/(h12+b_alpha*sec);
|
||||
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
|
||||
numtyp pbsr = b_alpha*sigma*hsec;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<3; i++) {
|
||||
numtyp u[3];
|
||||
u[0] = -rhat[i]*rhat[0];
|
||||
u[1] = -rhat[i]*rhat[1];
|
||||
u[2] = -rhat[i]*rhat[2];
|
||||
u[i] += (numtyp)1.0;
|
||||
u[0] *= rnorm;
|
||||
u[1] *= rnorm;
|
||||
u[2] *= rnorm;
|
||||
numtyp dchi = gpu_dot3(u,fourw);
|
||||
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
|
||||
numtyp dUa = pbsu*dchi-dh12*dspu;
|
||||
numtyp dUr = pbsr*dchi-dh12*dspr;
|
||||
numtyp force=dUr*Ur+dUa*Ua;
|
||||
if (i==0) {
|
||||
f.x+=force;
|
||||
if (vflag>0)
|
||||
virial[0]+=-r[0]*force;
|
||||
} else if (i==1) {
|
||||
f.y+=force;
|
||||
if (vflag>0) {
|
||||
virial[1]+=-r[1]*force;
|
||||
virial[3]+=-r[0]*force;
|
||||
}
|
||||
} else {
|
||||
f.z+=force;
|
||||
if (vflag>0) {
|
||||
virial[2]+=-r[2]*force;
|
||||
virial[4]+=-r[0]*force;
|
||||
virial[5]+=-r[1]*force;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[7][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<3; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
|
||||
if (eflag>0 || vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
red_acc[6][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<7; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
energy=red_acc[6][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
ans[ii]=f;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
sp_lj[0]=gum[0];
|
||||
sp_lj[1]=gum[1];
|
||||
sp_lj[2]=gum[2];
|
||||
sp_lj[3]=gum[3];
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *list_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
int ii=itype*lj_types+jtype;
|
||||
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
|
||||
force*=factor_lj;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
|
||||
energy+=factor_lj*(e-lj3[ii].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<4; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in, __global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
int tid=THREAD_ID_X;
|
||||
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
|
||||
ii+=tid/t_per_atom+start;
|
||||
int offset=tid%t_per_atom;
|
||||
|
||||
__local numtyp sp_lj[4];
|
||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||
if (tid<4)
|
||||
sp_lj[tid]=gum[tid];
|
||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||
lj1[tid]=lj1_in[tid];
|
||||
if (eflag>0)
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp energy=(acctyp)0;
|
||||
acctyp4 f;
|
||||
f.x=(acctyp)0;
|
||||
f.y=(acctyp)0;
|
||||
f.z=(acctyp)0;
|
||||
acctyp virial[6];
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<inum) {
|
||||
__global int *nbor=dev_ij+ii;
|
||||
int i=*nbor;
|
||||
nbor+=stride;
|
||||
int numj=*nbor;
|
||||
nbor+=stride;
|
||||
__global int *list_end=nbor+mul24(stride,numj);
|
||||
nbor+=mul24(offset,stride);
|
||||
int n_stride=mul24(t_per_atom,stride);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
|
||||
int j=*nbor;
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp r2inv = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
|
||||
r2inv=(numtyp)1.0/r2inv;
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||
energy+=factor_lj*(e-lj3[mtype].z);
|
||||
}
|
||||
if (vflag>0) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
virial[3] += delx*dely*force;
|
||||
virial[4] += delx*delz*force;
|
||||
virial[5] += dely*delz*force;
|
||||
}
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
// Reduce answers
|
||||
if (t_per_atom>1) {
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
|
||||
red_acc[0][tid]=f.x;
|
||||
red_acc[1][tid]=f.y;
|
||||
red_acc[2][tid]=f.z;
|
||||
red_acc[3][tid]=energy;
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<4; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
f.x=red_acc[0][tid];
|
||||
f.y=red_acc[1][tid];
|
||||
f.z=red_acc[2][tid];
|
||||
energy=red_acc[3][tid];
|
||||
|
||||
if (vflag>0) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid]=virial[r];
|
||||
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||
if (offset < s) {
|
||||
for (int r=0; r<6; r++)
|
||||
red_acc[r][tid] += red_acc[r][tid+s];
|
||||
}
|
||||
}
|
||||
|
||||
for (int r=0; r<6; r++)
|
||||
virial[r]=red_acc[r][tid];
|
||||
}
|
||||
}
|
||||
|
||||
// Store answers
|
||||
if (ii<inum && offset==0) {
|
||||
__global acctyp *ap1=engv+ii;
|
||||
if (eflag>0) {
|
||||
*ap1+=energy;
|
||||
ap1+=inum;
|
||||
}
|
||||
if (vflag>0) {
|
||||
for (int i=0; i<6; i++) {
|
||||
*ap1+=virial[i];
|
||||
ap1+=inum;
|
||||
}
|
||||
}
|
||||
acctyp4 old=ans[ii];
|
||||
old.x+=f.x;
|
||||
old.y+=f.y;
|
||||
old.z+=f.z;
|
||||
ans[ii]=old;
|
||||
} // if ii
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
set files = `echo *.h *.cpp *.cu`
|
||||
rm -rf /tmp/cpp5678
|
||||
mkdir /tmp/cpp5678
|
||||
mkdir /tmp/cpp5678/lgpu
|
||||
|
||||
foreach file ( $files )
|
||||
# /bin/cp $file /tmp/cpp5678/$file:t:t
|
||||
# ------ Sed Replace
|
||||
sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
|
||||
end
|
||||
|
Loading…
Reference in New Issue