git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6407 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-06-14 13:34:06 +00:00
parent 9418bacff7
commit 945343265e
8 changed files with 3622 additions and 0 deletions

View File

@ -0,0 +1,531 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMMM_GPU_KERNEL
#define CMMM_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
_ia2=(numtyp)-1.0/cut_coulsq;
_ia3=_ia2*_ia;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const int smooth, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__local numtyp _ia;
__local numtyp _ia2;
__local numtyp _ia3;
_ia=(numtyp)-1.0/sqrt(cut_coulsq);
_ia2=(numtyp)1.0/cut_coulsq;
_ia3=_ia2*_ia;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
numtyp ir, r2_ia2, r4_ia4, r6_ia6;
if (rsq < cut_coulsq) {
ir = (numtyp)1.0/sqrt(rsq);
prefactor = qqrd2e*qtmp*fetch_q(j,q_);
r2_ia2 = rsq*_ia2;
r4_ia4 = r2_ia2*r2_ia2;
if (smooth==0)
forcecoul = prefactor*(_ia3*((numtyp)-4.375+(numtyp)5.25*r2_ia2-
(numtyp)1.875*r4_ia4)-ir/rsq-
factor_coul*ir);
else {
r6_ia6 = r2_ia2*r4_ia4;
forcecoul = prefactor*(_ia3*((numtyp)-6.5625+(numtyp)11.8125*
r2_ia2-(numtyp)8.4375*r4_ia4+
(numtyp)2.1875*r6_ia6)-ir/rsq-
factor_coul*ir);
}
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = forcecoul + force_lj * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
if (smooth==0)
e_coul += prefactor*(ir+_ia*((numtyp)2.1875-(numtyp)2.1875*r2_ia2+
(numtyp)1.3125*r4_ia4-
(numtyp)0.3125*r4_ia4*r2_ia2)-
factor_coul*ir);
else
e_coul += prefactor*(ir+_ia*((numtyp)2.4609375-(numtyp)3.28125*
r2_ia2+(numtyp)2.953125*r4_ia4-
(numtyp)1.40625*r6_ia6+
(numtyp)0.2734375*r4_ia4*r4_ia4));
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii*/
}
#endif

165
lib/gpu/ellipsoid_nbor.cu Normal file
View File

@ -0,0 +1,165 @@
// **************************************************************************
// ellipsoid_nbor.cu
// -------------------
// W. Michael Brown
//
// Device code for Ellipsoid neighbor routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef ELLIPSOID_NBOR_H
#define ELLIPSOID_NBOR_H
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch, const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif

429
lib/gpu/gayberne.cu Normal file
View File

@ -0,0 +1,429 @@
// **************************************************************************
// gayberne.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_CU
#define GAYBERNE_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_diag_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_diag_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif

594
lib/gpu/gayberne_lj.cu Normal file
View File

@ -0,0 +1,594 @@
// **************************************************************************
// gayberne_lj.cu
// -------------------
// W. Michael Brown
//
// Device code for Gay-Berne - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef GAYBERNE_LJ_CU
#define GAYBERNE_LJ_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int ii=itype*lj_types+jtype;
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
energy+=factor_lj*(e-lj3[ii].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif

472
lib/gpu/lj_class2_long.cu Normal file
View File

@ -0,0 +1,472 @@
// **************************************************************************
// lj_class2_long.cu
// -------------------
// W. Michael Brown
//
// Device code for COMPASS LJ long acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#ifdef NV_KERNEL
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=(numtyp)1.0/rsq;
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=sqrt(r2inv);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii*/
}
#endif

526
lib/gpu/re_squared.cu Normal file
View File

@ -0,0 +1,526 @@
// **************************************************************************
// re_squared.cu
// -------------------
// W. Michael Brown
//
// Device code for RE-Squared potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Fri May 06 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef RE_SQUARED_CU
#define RE_SQUARED_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
{
numtyp ans;
ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] -
m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] +
m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] +
m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] -
m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] +
m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] +
m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] -
m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] +
m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4];
return ans;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
const int astride, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9]; // Rotation matrix (lab->body)
numtyp aTe1[9]; // A'*E
numtyp gamma1[9]; // A'*S^2*A
numtyp sa1[9]; // S^2*A;
numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z)
numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA
numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa
numtyp4 ishape;
ishape=shape[itype];
numtyp4 ishape2;
ishape2.x=ishape.x*ishape.x;
ishape2.y=ishape.y*ishape.y;
ishape2.z=ishape.z*ishape.z;
numtyp ilshape = ishape.x*ishape.y*ishape.z;
{
numtyp aTs[9]; // A1'*S1^2
gpu_quat_to_mat_trans(q,i,a1);
gpu_transpose_times_diag3(a1,well[itype],aTe1);
gpu_transpose_times_diag3(a1,ishape2,aTs);
gpu_diag_times3(ishape2,a1,sa1);
gpu_times3(aTs,a1,gamma1);
gpu_rotation_generator_x(a1,lA1_0);
gpu_rotation_generator_y(a1,lA1_1);
gpu_rotation_generator_z(a1,lA1_2);
gpu_times3(aTs,lA1_0,lAtwo1_0);
gpu_transpose_times3(lA1_0,sa1,lAsa1_0);
gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0);
gpu_times3(aTs,lA1_1,lAtwo1_1);
gpu_transpose_times3(lA1_1,sa1,lAsa1_1);
gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1);
gpu_times3(aTs,lA1_2,lAtwo1_2);
gpu_transpose_times3(lA1_2,sa1,lAsa1_2);
gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2);
}
ishape2.x=(numtyp)1.0/ishape2.x;
ishape2.y=(numtyp)1.0/ishape2.y;
ishape2.z=(numtyp)1.0/ishape2.z;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = jx.x-ix.x;
r[1] = jx.y-ix.y;
r[2] = jx.z-ix.z;
rnorm = gpu_dot3(r,r);
rnorm = rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp a2[9]; // Rotation matrix (lab->body)
numtyp gamma2[9]; // A'*S^2*A
numtyp4 jshape;
jshape=shape[jtype];
numtyp4 jshape2;
jshape2.x=jshape.x*jshape.x;
jshape2.y=jshape.y*jshape.y;
jshape2.z=jshape.z*jshape.z;
{
numtyp aTs[9]; // A1'*S1^2
gpu_quat_to_mat_trans(q,j,a2);
gpu_transpose_times_diag3(a2,jshape2,aTs);
gpu_times3(aTs,a2,gamma2);
}
numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3];
numtyp sigma12, sigma1, sigma2;
gpu_plus3(gamma1,gamma2,temp);
gpu_mldivide3(temp,rhat,s,err_flag);
sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
gpu_times_column3(a1,rhat,z1);
gpu_times_column3(a2,rhat,z2);
v1[0] = z1[0]*ishape2.x;
v1[1] = z1[1]*ishape2.y;
v1[2] = z1[2]*ishape2.z;
v2[0] = z2[0]/jshape2.x;
v2[1] = z2[1]/jshape2.y;
v2[2] = z2[2]/jshape2.z;
sigma1 = sqrt(gpu_dot3(z1,v1));
sigma2 = sqrt(gpu_dot3(z2,v2));
numtyp H12[9];
numtyp dH;
H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2;
H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2;
H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2;
H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2;
H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2;
H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2;
H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2;
H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2;
H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2;
dH=gpu_det3(H12);
numtyp sigma1p2, sigma2p2, lambda, nu;
sigma1p2 = sigma1*sigma1;
sigma2p2 = sigma2*sigma2;
numtyp jlshape = jshape.x*jshape.y*jshape.z;
lambda = ilshape*sigma1p2 + jlshape*sigma2p2;
sigma1=(numtyp)1.0/sigma1;
sigma2=(numtyp)1.0/sigma2;
nu = sqrt(dH/(sigma1+sigma2));
gpu_times3(aTe1,a1,temp);
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp w[3], temp2[9];
numtyp h12,eta,chi,sprod,sigh,tprod;
numtyp aTe2[9]; // A'*E
gpu_transpose_times_diag3(a2,well[jtype],aTe2);
gpu_times3(aTe2,a2,temp2);
gpu_plus3(temp,temp2,temp);
gpu_mldivide3(temp,rhat,w,err_flag);
h12 = (numtyp)1.0/rnorm-sigma12;
eta = lambda/nu;
chi = (numtyp)2.0*gpu_dot3(rhat,w);
sprod = ilshape * jlshape;
sigh = sigma/h12;
tprod = eta*chi*sigh;
numtyp stemp, Ua;
stemp = h12*(numtyp)0.5;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*
(ishape.z+stemp)*(jshape.x+stemp)*
(jshape.y+stemp)*(jshape.z+stemp);
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua;
Ua = epsilon*Ua/(numtyp)-36.0;
numtyp Ur;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*
(ishape.z+stemp)*(jshape.x+stemp)*
(jshape.y+stemp)*(jshape.z+stemp);
Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigh6/(numtyp)2025.0;
energy+=Ua+Ur;
// force
numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9];
numtyp sec, sigma12p3, sigma1p3, sigma2p3;
sec = sigma*eta*chi;
sigma12p3 = sigma12*sigma12*sigma12;
sigma1p3 = sigma1/sigma1p2;
sigma2p3 = sigma2/sigma2p2;
vsigma1[0] = -sigma1p3*v1[0];
vsigma1[1] = -sigma1p3*v1[1];
vsigma1[2] = -sigma1p3*v1[2];
vsigma2[0] = -sigma2p3*v2[0];
vsigma2[1] = -sigma2p3*v2[1];
vsigma2[2] = -sigma2p3*v2[2];
gsigma1[0] = -gamma1[0]*sigma1p2;
gsigma1[1] = -gamma1[1]*sigma1p2;
gsigma1[2] = -gamma1[2]*sigma1p2;
gsigma1[3] = -gamma1[3]*sigma1p2;
gsigma1[4] = -gamma1[4]*sigma1p2;
gsigma1[5] = -gamma1[5]*sigma1p2;
gsigma1[6] = -gamma1[6]*sigma1p2;
gsigma1[7] = -gamma1[7]*sigma1p2;
gsigma1[8] = -gamma1[8]*sigma1p2;
gsigma2[0] = -gamma2[0]*sigma2p2;
gsigma2[1] = -gamma2[1]*sigma2p2;
gsigma2[2] = -gamma2[2]*sigma2p2;
gsigma2[3] = -gamma2[3]*sigma2p2;
gsigma2[4] = -gamma2[4]*sigma2p2;
gsigma2[5] = -gamma2[5]*sigma2p2;
gsigma2[6] = -gamma2[6]*sigma2p2;
gsigma2[7] = -gamma2[7]*sigma2p2;
gsigma2[8] = -gamma2[8]*sigma2p2;
numtyp tsig1sig2, tdH, teta1, teta2;
numtyp fourw[3], spr[3];
tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2));
tdH = eta/((numtyp)2.0*dH);
teta1 = (numtyp)2.0*eta/lambda;
teta2 = teta1*jlshape/sigma2p3;
teta1 = teta1*ilshape/sigma1p3;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
numtyp hsec, dspu, pbsu;
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
(numtyp)1.0/(jshape.x*(numtyp)2.0+h12)+
(numtyp)1.0/(jshape.y*(numtyp)2.0+h12)+
(numtyp)1.0/(jshape.z*(numtyp)2.0+h12);
hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
dspu = (numtyp)1.0/h12-hsec+stemp;
pbsu = (numtyp)3.0*sigma*hsec;
numtyp dspr, pbsr;
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
(numtyp)1.0/(ishape.y*cr60+h12)+
(numtyp)1.0/(ishape.z*cr60+h12)+
(numtyp)1.0/(jshape.x*cr60+h12)+
(numtyp)1.0/(jshape.y*cr60+h12)+
(numtyp)1.0/(jshape.z*cr60+h12);
hsec = (numtyp)1.0/(h12+b_alpha*sec);
dspr = (numtyp)7.0/h12-hsec+stemp;
pbsr = b_alpha*sigma*hsec;
numtyp dH12[9];
numtyp dUa, dUr, deta, dchi, ddH, dh12;
numtyp dsigma1, dsigma2;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3], u1[3], u2[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
gpu_times_column3(a1,u,u1);
gpu_times_column3(a2,u,u2);
dsigma1=gpu_dot3(u1,vsigma1);
dsigma2=gpu_dot3(u2,vsigma2);
dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0];
dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1];
dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2];
dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3];
dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4];
dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5];
dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6];
dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7];
dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8];
ddH = det_prime(H12,dH12);
deta = (dsigma1+dsigma2)*tsig1sig2;
deta -= ddH*tdH;
deta -= dsigma1*teta1+dsigma2*teta2;
dchi = gpu_dot3(u,fourw);
dh12 = rhat[i]+gpu_dot3(u,spr);
dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
// torque on i
sigma1=(numtyp)1.0/sigma1;
numtyp fwae[3], p[3];
gpu_row_times3(fourw,aTe1,fwae);
{
gpu_times_column3(lA1_0,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_0,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_0,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.x -= (dUa*Ua+dUr*Ur);
}
{
gpu_times_column3(lA1_1,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_1,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_1,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.y -= (dUa*Ua+dUr*Ur);
}
{
gpu_times_column3(lA1_2,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_2,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_2,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.z -= (dUa*Ua+dUr*Ur);
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif

892
lib/gpu/re_squared_lj.cu Normal file
View File

@ -0,0 +1,892 @@
// **************************************************************************
// re_squared_lj.cu
// -------------------
// W. Michael Brown
//
// Device code for RE-Squared - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Fri May 06 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef RE_SQUARED_LJ_CU
#define RE_SQUARED_LJ_CU
#ifdef NV_KERNEL
#include "ellipsoid_extra.h"
#endif
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
numtyp aTe[9]; // A'*E
numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z)
numtyp4 ishape;
ishape=shape[itype];
numtyp ilshape=ishape.x*ishape.y*ishape.z;
{
gpu_quat_to_mat_trans(q,i,a);
gpu_transpose_times_diag3(a,well[itype],aTe);
gpu_rotation_generator_x(a,lA_0);
gpu_rotation_generator_y(a,lA_1);
gpu_rotation_generator_z(a,lA_2);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = jx.x-ix.x;
r[1] = jx.y-ix.y;
r[2] = jx.z-ix.z;
rnorm = gpu_dot3(r,r);
rnorm = rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp aTs[9];
numtyp4 scorrect;
numtyp half_sigma=sigma*(numtyp)0.5;
scorrect.x = ishape.x+half_sigma;
scorrect.y = ishape.y+half_sigma;
scorrect.z = ishape.z+half_sigma;
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
gpu_transpose_times_diag3(a,scorrect,aTs);
// energy
numtyp gamma[9], s[3];
gpu_times3(aTs,a,gamma);
gpu_mldivide3(gamma,rhat,s,err_flag);
numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
numtyp temp[9], w[3];
gpu_times3(aTe,a,temp);
temp[0] += (numtyp)1.0;
temp[4] += (numtyp)1.0;
temp[8] += (numtyp)1.0;
gpu_mldivide3(temp,rhat,w,err_flag);
numtyp h12 = (numtyp)1.0/rnorm-sigma12;
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
numtyp sigh = sigma/h12;
numtyp tprod = chi*sigh;
numtyp Ua, Ur;
numtyp h12p3 = h12*h12*h12;
numtyp sigmap3 = sigma*sigma*sigma;
numtyp stemp = h12*(numtyp)0.5;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
Ua = epsilon*Ua*sigmap3*solv_f_a;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
(numtyp)60.0;
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
energy+=Ua+Ur;
// force
numtyp fourw[3], spr[3];
numtyp sec = sigma*chi;
numtyp sigma12p3 = sigma12*sigma12*sigma12;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
(numtyp)3.0/h12;
numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
numtyp pbsu = (numtyp)3.0*sigma*hsec;
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
(numtyp)1.0/(ishape.y*cr60+h12)+
(numtyp)1.0/(ishape.z*cr60+h12)+
(numtyp)3.0/h12;
hsec = (numtyp)1.0/(h12+b_alpha*sec);
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
numtyp pbsr = b_alpha*sigma*hsec;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
numtyp dchi = gpu_dot3(u,fourw);
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
// torque on i
numtyp fwae[3];
gpu_row_times3(fourw,aTe,fwae);
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_0,rhat,p);
gpu_times_column3(lA_0,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_0,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.x -= (dUa*Ua+dUr*Ur);
}
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_1,rhat,p);
gpu_times_column3(lA_1,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_1,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.y -= (dUa*Ua+dUr*Ur);
}
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_2,rhat,p);
gpu_times_column3(lA_2,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_2,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.z -= (dUa*Ua+dUr*Ur);
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=astride;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
old=ans[ii+astride];
old.x+=tor.x;
old.y+=tor.y;
old.z+=tor.z;
ans[ii+astride]=old;
} // if ii
}
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=pow((numtyp)60.0,(numtyp)1.0/(numtyp)3.0);
solv_f_a = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*-(numtyp)36.0);
solv_f_r = (numtyp)3.0/((numtyp)16.0*atan((numtyp)1.0)*(numtyp)2025.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int j=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 jx=x_[j];
int jtype=jx.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int i=*nbor;
factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK;
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
numtyp aTe[9]; // A'*E
numtyp4 ishape;
ishape=shape[itype];
gpu_quat_to_mat_trans(q,i,a);
gpu_transpose_times_diag3(a,well[itype],aTe);
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = ix.x-jx.x;
r[1] = ix.y-jx.y;
r[2] = ix.z-jx.z;
rnorm = gpu_dot3(r,r);
rnorm = rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=mul24(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp aTs[9];
numtyp4 scorrect;
numtyp half_sigma=sigma * (numtyp)0.5;
scorrect.x = ishape.x+half_sigma;
scorrect.y = ishape.y+half_sigma;
scorrect.z = ishape.z+half_sigma;
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
gpu_transpose_times_diag3(a,scorrect,aTs);
// energy
numtyp gamma[9], s[3];
gpu_times3(aTs,a,gamma);
gpu_mldivide3(gamma,rhat,s,err_flag);
numtyp sigma12 = rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
numtyp temp[9], w[3];
gpu_times3(aTe,a,temp);
temp[0] += (numtyp)1.0;
temp[4] += (numtyp)1.0;
temp[8] += (numtyp)1.0;
gpu_mldivide3(temp,rhat,w,err_flag);
numtyp h12 = (numtyp)1.0/rnorm-sigma12;
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
numtyp sigh = sigma/h12;
numtyp tprod = chi*sigh;
numtyp Ua, Ur;
numtyp h12p3 = h12*h12*h12;
numtyp sigmap3 = sigma*sigma*sigma;
numtyp stemp = h12/(numtyp)2.0;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
numtyp ilshape=ishape.x*ishape.y*ishape.z;
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
Ua = epsilon*Ua*sigmap3*solv_f_a;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
(numtyp)60.0;
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
energy+=Ua+Ur;
// force
numtyp fourw[3], spr[3];
numtyp sec = sigma*chi;
numtyp sigma12p3 = sigma12*sigma12*sigma12;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
stemp = (numtyp)1.0/(ishape.x*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.y*(numtyp)2.0+h12)+
(numtyp)1.0/(ishape.z*(numtyp)2.0+h12)+
(numtyp)3.0/h12;
numtyp hsec = (numtyp)1.0/(h12+(numtyp)3.0*sec);
numtyp dspu = (numtyp)1.0/h12-hsec+stemp;
numtyp pbsu = (numtyp)3.0*sigma*hsec;
stemp = (numtyp)1.0/(ishape.x*cr60+h12)+
(numtyp)1.0/(ishape.y*cr60+h12)+
(numtyp)1.0/(ishape.z*cr60+h12)+
(numtyp)3.0/h12;
hsec = (numtyp)1.0/(h12+b_alpha*sec);
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
numtyp pbsr = b_alpha*sigma*hsec;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
numtyp dchi = gpu_dot3(u,fourw);
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<3; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[0];
sp_lj[1]=gum[1];
sp_lj[2]=gum[2];
sp_lj[3]=gum[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int ii=itype*lj_types+jtype;
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
energy+=factor_lj*(e-lj3[ii].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif

13
lib/gpu/replace_code.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/tcsh
set files = `echo *.h *.cpp *.cu`
rm -rf /tmp/cpp5678
mkdir /tmp/cpp5678
mkdir /tmp/cpp5678/lgpu
foreach file ( $files )
# /bin/cp $file /tmp/cpp5678/$file:t:t
# ------ Sed Replace
sed -i 's/atom->dev_engv/ans->dev_engv/g' $file
end