forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12588 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
4bb43ca885
commit
621fa7d600
|
@ -6,7 +6,7 @@ precision ?= 1
|
|||
verbose ?= 1
|
||||
|
||||
#GPU architecture (compute capability): 13, 20, 21, 35
|
||||
arch ?= 21
|
||||
arch ?= 20
|
||||
|
||||
#Using cufft (should not be changed)
|
||||
cufft ?= 1
|
||||
|
|
|
@ -85,15 +85,15 @@ void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
{
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
|
||||
|
||||
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
|
||||
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*));
|
||||
|
||||
if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
|
||||
|
||||
|
@ -121,9 +121,9 @@ void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
|
||||
cudaThreadSynchronize();
|
||||
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
|
||||
|
@ -143,14 +143,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
|
|||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
int size = (n * n_data_items) * sizeof(X_FLOAT);
|
||||
int size = (n * n_data_items) * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -185,8 +185,8 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
|
|||
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
|
||||
|
||||
if(not sdata->overlap_comm)
|
||||
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_download +=
|
||||
|
@ -216,16 +216,16 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in
|
|||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
int size = (n * n_data_items) * sizeof(X_FLOAT);
|
||||
int size = (n * n_data_items) * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -276,7 +276,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
|
|||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
int size = (n * n_data_items) * sizeof(X_FLOAT);
|
||||
int size = (n * n_data_items) * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
@ -289,7 +289,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
|
|||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(not sdata->overlap_comm || iswap < 0)
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_upload +=
|
||||
|
@ -463,14 +463,14 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
|
|||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
|
||||
int size = nsend * n_data_items * sizeof(X_FLOAT);
|
||||
int size = nsend * n_data_items * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -522,14 +522,14 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
|
|||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
|
||||
int size = n * n_data_items * sizeof(X_FLOAT);
|
||||
int size = n * n_data_items * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -584,7 +584,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
|
|||
|
||||
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
|
||||
|
||||
int size = n * n_data_items * sizeof(X_FLOAT);
|
||||
int size = n * n_data_items * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
extern __shared__ int shared[];
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
|
||||
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -40,44 +40,44 @@ __global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxli
|
|||
int k = 0;
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
((X_FLOAT*) buffer)[i + k * n] = _x[j] + dx;
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx;
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
((X_FLOAT*) buffer)[i + k * n] = _v[j];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _v[j];
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _v[j + _nmax];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax];
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
((X_FLOAT*) buffer)[i + k * n] = _omega[j];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _omega[j];
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
|
||||
k++;
|
||||
((X_FLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
|
||||
((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & RADIUS_MASK)((X_FLOAT*) buffer)[i + k * n] = _radius[j];
|
||||
if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j];
|
||||
|
||||
k++;
|
||||
|
||||
if(data_mask & RMASS_MASK)((X_FLOAT*) buffer)[i + k * n] = _rmass[j];
|
||||
if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j];
|
||||
|
||||
k++;
|
||||
}
|
||||
}
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
|
||||
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@ -121,37 +121,37 @@ __global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffe
|
|||
int k = 0;
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
_x[i + first] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_x[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
_v[i + first] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_v[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
_omega[i + first] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_omega[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
_omega[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
_omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
k++;
|
||||
}
|
||||
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
|
||||
k++;
|
||||
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) buffer)[i + k * n];
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
|
||||
|
||||
k++;
|
||||
}
|
||||
|
@ -163,8 +163,8 @@ __global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
|
|||
double* buf = (double*) _buffer;
|
||||
buf = &buf[1];
|
||||
|
||||
//X_FLOAT lo=slablo[iswap];
|
||||
//X_FLOAT hi=slabhi[iswap];
|
||||
//X_CFLOAT lo=slablo[iswap];
|
||||
//X_CFLOAT hi=slabhi[iswap];
|
||||
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
bool add = false;
|
||||
|
@ -369,7 +369,7 @@ __global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int*
|
|||
}
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
|
||||
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -379,37 +379,37 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max
|
|||
int m = 0;
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j];
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
|
||||
}
|
||||
|
||||
if(data_mask & TAG_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _tag[j];
|
||||
if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j];
|
||||
|
||||
if(data_mask & TYPE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _type[j];
|
||||
if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j];
|
||||
|
||||
if(data_mask & MASK_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _mask[j];
|
||||
if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j];
|
||||
|
||||
if(data_mask & Q_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _q[j];
|
||||
if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j];
|
||||
|
||||
if(data_mask & MOLECULE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
|
||||
if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
|
||||
|
||||
if(data_mask & RADIUS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _radius[i];
|
||||
if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[i];
|
||||
|
||||
if(data_mask & DENSITY_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _density[i];
|
||||
if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[i];
|
||||
|
||||
if(data_mask & RMASS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _rmass[i];
|
||||
if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[i];
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i];
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax];
|
||||
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax];
|
||||
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -417,7 +417,7 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max
|
|||
|
||||
|
||||
template <const unsigned int data_mask>
|
||||
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
|
||||
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -471,37 +471,37 @@ __global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
|
|||
int m = 0;
|
||||
|
||||
if(data_mask & X_MASK) {
|
||||
_x[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
|
||||
if(data_mask & V_MASK) {
|
||||
_v[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
|
||||
if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
|
||||
if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
|
||||
if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
|
||||
if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & Q_MASK) _q[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
|
||||
if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
|
||||
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & DENSITY_MASK) _density[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
|
||||
if(data_mask & OMEGA_MASK) {
|
||||
_omega[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
_omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
|
||||
}
|
||||
} else {
|
||||
_flag[0] = 1;
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
|
||||
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
|
||||
{
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -53,9 +53,9 @@ void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
{
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
|
@ -65,7 +65,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata)
|
|||
Cuda_CommCuda_UpdateNmax(sdata);
|
||||
int ntypesp = sdata->atom.ntypes + 1;
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*));
|
||||
}
|
||||
|
@ -81,14 +81,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -123,8 +123,8 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
|
|||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
if(not sdata->overlap_comm)
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_download +=
|
||||
|
@ -151,14 +151,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_FLOAT);
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -193,8 +193,8 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
|
|||
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
|
||||
|
||||
if(not sdata->overlap_comm)
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
sdata->cuda_timings.comm_forward_download +=
|
||||
|
@ -221,16 +221,16 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -278,16 +278,16 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_FLOAT);
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -334,7 +334,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
@ -347,7 +347,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
|
|||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(not sdata->overlap_comm || iswap < 0)
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_upload +=
|
||||
|
@ -375,7 +375,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 6 * sizeof(X_FLOAT);
|
||||
int size = n * 6 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
@ -388,7 +388,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
|
|||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(not sdata->overlap_comm || iswap < 0)
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
sdata->cuda_timings.comm_forward_upload +=
|
||||
|
@ -414,22 +414,22 @@ int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* b
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(F_FLOAT);
|
||||
int size = n * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
||||
|
||||
F_FLOAT* buf = (F_FLOAT*)buf_send;
|
||||
F_FLOAT* f_dev = (F_FLOAT*)sdata->atom.f.dev_data;
|
||||
F_CFLOAT* buf = (F_CFLOAT*)buf_send;
|
||||
F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data;
|
||||
f_dev += first;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
buf += n;
|
||||
f_dev += sdata->atom.nmax;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
buf += n;
|
||||
f_dev += sdata->atom.nmax;
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
return n * 3;
|
||||
}
|
||||
|
||||
|
@ -442,7 +442,7 @@ void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(F_FLOAT);
|
||||
int size = n * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
@ -468,7 +468,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap,
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_CommCuda_UpdateBuffer(sdata, n);
|
||||
|
@ -520,9 +520,9 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in
|
|||
my_gettime(CLOCK_REALTIME, &time1);
|
||||
|
||||
if(style == 1)
|
||||
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
else
|
||||
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
my_gettime(CLOCK_REALTIME, &time2);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
|
||||
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -31,13 +31,13 @@ __global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistl
|
|||
|
||||
if(j > _nmax) _flag[0] = 1;
|
||||
|
||||
((X_FLOAT*) buffer)[i] = _x[j] + dx;
|
||||
((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
|
||||
((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
|
||||
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
|
||||
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
|
||||
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
|
||||
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -47,16 +47,16 @@ __global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxli
|
|||
|
||||
if(j > _nmax) _flag[0] = 1;
|
||||
|
||||
((X_FLOAT*) buffer)[i] = _x[j] + dx;
|
||||
((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
|
||||
((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
|
||||
((X_FLOAT*) buffer)[i + 3 * n] = _v[j];
|
||||
((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
|
||||
((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
|
||||
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
|
||||
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
|
||||
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
|
||||
((X_CFLOAT*) buffer)[i + 3 * n] = _v[j];
|
||||
((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
|
||||
((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
|
||||
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@ -72,7 +72,7 @@ __global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int max
|
|||
}
|
||||
}
|
||||
|
||||
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
|
||||
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@ -96,9 +96,9 @@ __global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
_x[i + first] = ((X_FLOAT*) buffer)[i];
|
||||
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
|
||||
_x[i + first] = ((X_CFLOAT*) buffer)[i];
|
||||
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -108,12 +108,12 @@ __global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffe
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
_x[i + first] = ((X_FLOAT*) buffer)[i];
|
||||
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
|
||||
_v[i + first] = ((X_FLOAT*) buffer)[i + 3 * n];
|
||||
_v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 4 * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n];
|
||||
_x[i + first] = ((X_CFLOAT*) buffer)[i];
|
||||
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n];
|
||||
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n];
|
||||
_v[i + first] = ((X_CFLOAT*) buffer)[i + 3 * n];
|
||||
_v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 4 * n];
|
||||
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 5 * n];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,9 +122,9 @@ __global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
((F_FLOAT*) _buffer)[i] = _f[i + first];
|
||||
((F_FLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
|
||||
((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
|
||||
((F_CFLOAT*) _buffer)[i] = _f[i + first];
|
||||
((F_CFLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
|
||||
((F_CFLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -136,9 +136,9 @@ __global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int max
|
|||
|
||||
if(i < n) {
|
||||
int j = list[i];
|
||||
_f[j] += ((F_FLOAT*)_buffer)[i];
|
||||
_f[j + _nmax] += ((F_FLOAT*) _buffer)[i + n];
|
||||
_f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n];
|
||||
_f[j] += ((F_CFLOAT*)_buffer)[i];
|
||||
_f[j + _nmax] += ((F_CFLOAT*) _buffer)[i + n];
|
||||
_f[j + 2 * _nmax] += ((F_CFLOAT*) _buffer)[i + 2 * n];
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -161,11 +161,11 @@ __global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, in
|
|||
extern __shared__ int shared[];
|
||||
|
||||
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
|
||||
int nfirst, int nlast, int dim, int iswap, X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength)
|
||||
int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength)
|
||||
{
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
X_FLOAT lo = slablo[iswap];
|
||||
X_FLOAT hi = slabhi[iswap];
|
||||
X_CFLOAT lo = slablo[iswap];
|
||||
X_CFLOAT hi = slabhi[iswap];
|
||||
bool add = false;
|
||||
|
||||
if(!bordergroup || ineed >= 2) {
|
||||
|
@ -273,11 +273,11 @@ __global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, i
|
|||
|
||||
|
||||
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
|
||||
, int nfirst, int nlast, int dim, int iswap, X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength)
|
||||
, int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength)
|
||||
{
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes];
|
||||
X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes];
|
||||
X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes];
|
||||
X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes];
|
||||
int itype = 0;
|
||||
bool add = false;
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
|
||||
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
|
||||
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -50,15 +50,15 @@ void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
|
|||
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
|
||||
|
||||
if(sdata->atom.rmass_flag)
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
|
@ -68,7 +68,7 @@ void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
|
|||
}
|
||||
|
||||
|
||||
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
|
||||
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
|
||||
{
|
||||
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
|
||||
Cuda_ComputeTempCuda_UpdateNmax(sdata);
|
||||
|
@ -82,7 +82,7 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F
|
|||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit);
|
||||
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
|
||||
|
||||
|
@ -90,13 +90,13 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F
|
|||
grid.x = 6;
|
||||
grid.y = 1;
|
||||
threads.x = 512;
|
||||
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
|
||||
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
|
||||
}
|
||||
}
|
||||
|
||||
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
|
||||
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
|
||||
{
|
||||
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
|
||||
Cuda_ComputeTempCuda_UpdateNmax(sdata);
|
||||
|
@ -111,7 +111,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F
|
|||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
|
||||
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit);
|
||||
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
|
||||
|
||||
|
@ -119,7 +119,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F
|
|||
grid.x = 1;
|
||||
grid.y = 1;
|
||||
threads.x = 512;
|
||||
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
|
||||
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
|
||||
}
|
||||
|
|
|
@ -24,5 +24,5 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
|
||||
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
|
||||
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
|
||||
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_FLOAT sharedmem[];
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
|
||||
|
@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
|
|||
}
|
||||
|
||||
reduceBlock(sharedmem);
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
|
||||
|
@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
|
|||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
V_FLOAT massone;
|
||||
V_CFLOAT massone;
|
||||
|
||||
if(_rmass_flag) massone = _rmass[i];
|
||||
else massone = _mass[_type[i]];
|
||||
|
@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
|
|||
reduceBlock(&sharedmem[3 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[4 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[5 * blockDim.x]);
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
|
||||
|
@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
|
|||
}
|
||||
|
||||
|
||||
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
|
||||
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
ENERGY_FLOAT myforig = 0.0;
|
||||
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT myforig = 0.0;
|
||||
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
|
||||
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
|
||||
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -50,15 +50,15 @@ void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
|
|||
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
|
||||
|
||||
if(sdata->atom.rmass_flag)
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
|
@ -68,7 +68,7 @@ void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
|
|||
}
|
||||
|
||||
|
||||
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
|
||||
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
|
||||
{
|
||||
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
|
||||
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
|
||||
|
@ -82,20 +82,20 @@ void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, E
|
|||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
|
||||
Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x * grid.y;
|
||||
grid.x = 6;
|
||||
threads.x = 512;
|
||||
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
|
||||
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
|
||||
}
|
||||
}
|
||||
|
||||
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
|
||||
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
|
||||
{
|
||||
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
|
||||
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
|
||||
|
@ -110,14 +110,14 @@ void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, E
|
|||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
|
||||
Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
|
||||
Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x * grid.y;
|
||||
grid.x = 1;
|
||||
threads.x = 512;
|
||||
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
|
||||
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
|
||||
}
|
||||
|
@ -137,7 +137,7 @@ void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int grou
|
|||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
|
||||
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
|
||||
}
|
||||
|
@ -157,7 +157,7 @@ void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int gro
|
|||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
|
||||
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
|
||||
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_FLOAT sharedmem[];
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
|
||||
|
@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xfla
|
|||
}
|
||||
|
||||
reduceBlock(sharedmem);
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
|
|||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
V_FLOAT massone;
|
||||
V_CFLOAT massone;
|
||||
|
||||
if(_rmass_flag) massone = _rmass[i];
|
||||
else massone = _mass[_type[i]];
|
||||
|
@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
|
|||
reduceBlock(&sharedmem[3 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[4 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[5 * blockDim.x]);
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
|
|||
}
|
||||
|
||||
|
||||
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
|
||||
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
ENERGY_FLOAT myforig = 0.0;
|
||||
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT myforig = 0.0;
|
||||
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
@ -117,7 +117,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t
|
|||
t[blockIdx.x] = myforig;
|
||||
}
|
||||
|
||||
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
|
||||
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@ -140,7 +140,7 @@ __global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, i
|
|||
}
|
||||
}
|
||||
|
||||
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
|
||||
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -640,11 +640,11 @@ static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
|
|||
return __hiloint2double(v.y, v.x);
|
||||
}
|
||||
|
||||
static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
|
||||
static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
|
||||
{
|
||||
int4 v = tex1Dfetch(t, 2 * i);
|
||||
int4 u = tex1Dfetch(t, 2 * i + 1);
|
||||
X_FLOAT4 w;
|
||||
X_CFLOAT4 w;
|
||||
|
||||
w.x = __hiloint2double(v.y, v.x);
|
||||
w.y = __hiloint2double(v.w, v.z);
|
||||
|
@ -664,7 +664,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata)
|
|||
|
||||
#if X_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
|
||||
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
|
||||
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_CFLOAT4));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
|
||||
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
|
||||
|
@ -672,7 +672,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline X_FLOAT4 fetchXType(int i)
|
||||
static __device__ inline X_CFLOAT4 fetchXType(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if X_PRECISION == 1
|
||||
|
@ -692,11 +692,11 @@ static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
|
|||
return __hiloint2double(v.y, v.x);
|
||||
}
|
||||
|
||||
static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
|
||||
static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
|
||||
{
|
||||
int4 v = tex1Dfetch(t, 2 * i);
|
||||
int4 u = tex1Dfetch(t, 2 * i + 1);
|
||||
V_FLOAT4 w;
|
||||
V_CFLOAT4 w;
|
||||
|
||||
w.x = __hiloint2double(v.y, v.x);
|
||||
w.y = __hiloint2double(v.w, v.z);
|
||||
|
@ -716,7 +716,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata)
|
|||
|
||||
#if V_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
|
||||
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4));
|
||||
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_CFLOAT4));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
|
||||
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
|
||||
|
@ -724,7 +724,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline V_FLOAT4 fetchVRadius(int i)
|
||||
static __device__ inline V_CFLOAT4 fetchVRadius(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if V_PRECISION == 1
|
||||
|
@ -747,7 +747,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
|
|||
|
||||
#if V_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
|
||||
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4));
|
||||
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_CFLOAT4));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
|
||||
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
|
||||
|
@ -755,7 +755,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
|
||||
static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if V_PRECISION == 1
|
||||
|
@ -775,11 +775,11 @@ static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
|
|||
return __hiloint2double(v.y, v.x);
|
||||
}
|
||||
|
||||
static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
|
||||
static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
|
||||
{
|
||||
int4 v = tex1Dfetch(t, 2 * i);
|
||||
int4 u = tex1Dfetch(t, 2 * i + 1);
|
||||
F_FLOAT4 w;
|
||||
F_CFLOAT4 w;
|
||||
|
||||
w.x = __hiloint2double(v.y, v.x);
|
||||
w.y = __hiloint2double(v.w, v.z);
|
||||
|
@ -799,7 +799,7 @@ inline void BindQTexture(cuda_shared_data* sdata)
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
|
||||
|
@ -807,7 +807,7 @@ inline void BindQTexture(cuda_shared_data* sdata)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline F_FLOAT fetchQ(int i)
|
||||
static __device__ inline F_CFLOAT fetchQ(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if F_PRECISION == 1
|
||||
|
@ -835,7 +835,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
|
||||
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
|
||||
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
|
||||
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
|
||||
|
@ -843,7 +843,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline X_FLOAT4 fetchXType(int i)
|
||||
static __device__ inline X_CFLOAT4 fetchXType(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if X_PRECISION == 1
|
||||
|
@ -863,7 +863,7 @@ static inline __device__ int sbmask(int j)
|
|||
return j >> SBBITS & 3;
|
||||
}
|
||||
|
||||
static inline __device__ void minimum_image(X_FLOAT4 &delta)
|
||||
static inline __device__ void minimum_image(X_CFLOAT4 &delta)
|
||||
{
|
||||
if(_triclinic == 0) {
|
||||
if(_periodicity[0]) {
|
||||
|
@ -907,7 +907,7 @@ static inline __device__ void minimum_image(X_FLOAT4 &delta)
|
|||
}
|
||||
}
|
||||
|
||||
static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
|
||||
static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci)
|
||||
{
|
||||
ci.x = x2.x - x1.x;
|
||||
ci.y = x2.y - x1.y;
|
||||
|
|
|
@ -4,12 +4,12 @@
|
|||
|
||||
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
|
||||
{
|
||||
sdata->compile_settings.prec_glob = sizeof(CUDA_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_x = sizeof(X_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_v = sizeof(V_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_f = sizeof(F_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_pppm = sizeof(PPPM_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_fft = sizeof(FFT_FLOAT) / 4;
|
||||
sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4;
|
||||
sdata->compile_settings.prec_x = sizeof(X_CFLOAT) / 4;
|
||||
sdata->compile_settings.prec_v = sizeof(V_CFLOAT) / 4;
|
||||
sdata->compile_settings.prec_f = sizeof(F_CFLOAT) / 4;
|
||||
sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4;
|
||||
sdata->compile_settings.prec_fft = sizeof(FFT_CFLOAT) / 4;
|
||||
|
||||
#ifdef FFT_CUFFT
|
||||
sdata->compile_settings.cufft = 1;
|
||||
|
|
|
@ -60,7 +60,7 @@
|
|||
//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
|
||||
//#define &MY_AP(var) &(MY_AP(var))
|
||||
#define CUDA_USE_TEXTURE
|
||||
#define CUDA_USE_FLOAT4
|
||||
#define CUDA_USE_CFLOAT4
|
||||
|
||||
//constants used by many classes
|
||||
|
||||
|
@ -79,20 +79,20 @@
|
|||
#define _h MY_AP(h)
|
||||
#define _h_inv MY_AP(h_inv)
|
||||
#define _h_rate MY_AP(h_rate)
|
||||
__device__ __constant__ X_FLOAT _boxhi[3];
|
||||
__device__ __constant__ X_FLOAT _boxlo[3];
|
||||
__device__ __constant__ X_FLOAT _subhi[3];
|
||||
__device__ __constant__ X_FLOAT _sublo[3];
|
||||
__device__ __constant__ X_FLOAT _box_size[3];
|
||||
__device__ __constant__ X_FLOAT _prd[3];
|
||||
__device__ __constant__ X_CFLOAT _boxhi[3];
|
||||
__device__ __constant__ X_CFLOAT _boxlo[3];
|
||||
__device__ __constant__ X_CFLOAT _subhi[3];
|
||||
__device__ __constant__ X_CFLOAT _sublo[3];
|
||||
__device__ __constant__ X_CFLOAT _box_size[3];
|
||||
__device__ __constant__ X_CFLOAT _prd[3];
|
||||
__device__ __constant__ int _periodicity[3];
|
||||
__device__ __constant__ int _triclinic;
|
||||
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
|
||||
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
|
||||
__device__ __constant__ X_FLOAT _prd_lamda[3];
|
||||
__device__ __constant__ X_FLOAT _h[6];
|
||||
__device__ __constant__ X_FLOAT _h_inv[6];
|
||||
__device__ __constant__ V_FLOAT _h_rate[6];
|
||||
__device__ __constant__ X_CFLOAT _boxhi_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _boxlo_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _prd_lamda[3];
|
||||
__device__ __constant__ X_CFLOAT _h[6];
|
||||
__device__ __constant__ X_CFLOAT _h_inv[6];
|
||||
__device__ __constant__ V_CFLOAT _h_rate[6];
|
||||
|
||||
|
||||
//atom properties
|
||||
|
@ -123,31 +123,31 @@ __device__ __constant__ V_FLOAT _h_rate[6];
|
|||
#define _omega_rmass MY_AP(omega_rmass)
|
||||
#define _freeze_group_bit MY_AP(freeze_group_bit)
|
||||
#define _map_array MY_AP(map_array)
|
||||
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
|
||||
__device__ __constant__ V_FLOAT* _v;
|
||||
__device__ __constant__ F_FLOAT* _f;
|
||||
__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions
|
||||
__device__ __constant__ V_CFLOAT* _v;
|
||||
__device__ __constant__ F_CFLOAT* _f;
|
||||
__device__ __constant__ int* _tag;
|
||||
__device__ __constant__ int* _type;
|
||||
__device__ __constant__ int* _mask;
|
||||
__device__ __constant__ int* _image;
|
||||
__device__ __constant__ V_FLOAT* _mass;
|
||||
__device__ __constant__ F_FLOAT* _q;
|
||||
__device__ __constant__ V_FLOAT* _rmass;
|
||||
__device__ __constant__ V_CFLOAT* _mass;
|
||||
__device__ __constant__ F_CFLOAT* _q;
|
||||
__device__ __constant__ V_CFLOAT* _rmass;
|
||||
__device__ __constant__ int _rmass_flag;
|
||||
__device__ __constant__ ENERGY_FLOAT* _eatom;
|
||||
__device__ __constant__ ENERGY_FLOAT* _vatom;
|
||||
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions
|
||||
__device__ __constant__ X_FLOAT* _radius;
|
||||
__device__ __constant__ F_FLOAT* _density;
|
||||
__device__ __constant__ V_FLOAT* _omega;
|
||||
__device__ __constant__ F_FLOAT* _torque;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eatom;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _vatom;
|
||||
__device__ __constant__ X_CFLOAT4* _x_type; //holds pointer to positions
|
||||
__device__ __constant__ X_CFLOAT* _radius;
|
||||
__device__ __constant__ F_CFLOAT* _density;
|
||||
__device__ __constant__ V_CFLOAT* _omega;
|
||||
__device__ __constant__ F_CFLOAT* _torque;
|
||||
__device__ __constant__ int* _special;
|
||||
__device__ __constant__ int _maxspecial;
|
||||
__device__ __constant__ int* _nspecial;
|
||||
__device__ __constant__ int _special_flag[4];
|
||||
__device__ __constant__ int* _molecule;
|
||||
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions
|
||||
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions
|
||||
__device__ __constant__ V_CFLOAT4* _v_radius; //holds pointer to positions
|
||||
__device__ __constant__ V_CFLOAT4* _omega_rmass; //holds pointer to positions
|
||||
__device__ __constant__ int _freeze_group_bit;
|
||||
__device__ __constant__ int* _map_array;
|
||||
|
||||
|
@ -226,8 +226,8 @@ __device__ __constant__ int* _neighbors;
|
|||
__device__ __constant__ int* _neighbors_border;
|
||||
__device__ __constant__ int* _neighbors_inner;
|
||||
__device__ __constant__ int* _reneigh_flag;
|
||||
__device__ __constant__ X_FLOAT _triggerneighsq;
|
||||
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions
|
||||
__device__ __constant__ X_CFLOAT _triggerneighsq;
|
||||
__device__ __constant__ X_CFLOAT* _xhold; //holds pointer to positions
|
||||
__device__ __constant__ int _maxhold;
|
||||
__device__ __constant__ int _dist_check;
|
||||
__device__ __constant__ int _neighbor_maxlocal;
|
||||
|
@ -253,12 +253,12 @@ __device__ __constant__ unsigned _nghost;
|
|||
__device__ __constant__ unsigned _nlocal;
|
||||
__device__ __constant__ unsigned _nmax;
|
||||
__device__ __constant__ unsigned _cuda_ntypes;
|
||||
__device__ __constant__ V_FLOAT _dtf;
|
||||
__device__ __constant__ X_FLOAT _dtv;
|
||||
__device__ __constant__ V_FLOAT _factor;
|
||||
__device__ __constant__ ENERGY_FLOAT* _virial;
|
||||
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
|
||||
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
|
||||
__device__ __constant__ V_CFLOAT _dtf;
|
||||
__device__ __constant__ X_CFLOAT _dtv;
|
||||
__device__ __constant__ V_CFLOAT _factor;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _virial;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl;
|
||||
__device__ __constant__ ENERGY_CFLOAT* _eng_coul;
|
||||
__device__ __constant__ int _molecular;
|
||||
|
||||
//other general constants
|
||||
|
|
|
@ -55,30 +55,30 @@ enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_L
|
|||
#define _cutsq_global MY_AP(cutsq_global)
|
||||
#define _collect_forces_later MY_AP(collect_forces_later)
|
||||
|
||||
__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_FLOAT _special_lj[4];
|
||||
__device__ __constant__ F_FLOAT _special_coul[4];
|
||||
__device__ __constant__ X_FLOAT _cutsq_global;
|
||||
__device__ __constant__ X_CFLOAT _cutsq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ ENERGY_CFLOAT _offset[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_CFLOAT _special_lj[4];
|
||||
__device__ __constant__ F_CFLOAT _special_coul[4];
|
||||
__device__ __constant__ X_CFLOAT _cutsq_global;
|
||||
__device__ __constant__ int _collect_forces_later;
|
||||
|
||||
__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
|
||||
__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_CFLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
|
||||
__device__ __constant__ F_CFLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_CFLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_CFLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ F_CFLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
|
||||
|
||||
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff2_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff3_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff4_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff5_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff6_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff7_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff8_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff9_gm);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(coeff10_gm);
|
||||
|
||||
#define _coeff1_gm_tex MY_AP(coeff1_gm_tex)
|
||||
#if F_PRECISION == 1
|
||||
|
@ -159,17 +159,17 @@ texture<int2, 1> _coeff10_gm_tex;
|
|||
#define _g_ewald MY_AP(g_ewald)
|
||||
#define _qqrd2e MY_AP(qqrd2e)
|
||||
#define _kappa MY_AP(kappa)
|
||||
__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ X_FLOAT _cut_coulsq_global;
|
||||
__device__ __constant__ F_FLOAT _g_ewald;
|
||||
__device__ __constant__ F_FLOAT _qqrd2e;
|
||||
__device__ __constant__ F_FLOAT _kappa;
|
||||
__device__ __constant__ X_CFLOAT _cut_coulsq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ X_CFLOAT _cut_coulsq_global;
|
||||
__device__ __constant__ F_CFLOAT _g_ewald;
|
||||
__device__ __constant__ F_CFLOAT _qqrd2e;
|
||||
__device__ __constant__ F_CFLOAT _kappa;
|
||||
|
||||
//inner cutoff
|
||||
#define _cut_innersq MY_AP(cut_innersq)
|
||||
#define _cut_innersq_global MY_AP(cut_innersq_global)
|
||||
__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ X_FLOAT _cut_innersq_global;
|
||||
__device__ __constant__ X_CFLOAT _cut_innersq[CUDA_MAX_TYPES2];
|
||||
__device__ __constant__ X_CFLOAT _cut_innersq_global;
|
||||
|
||||
|
||||
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||||
|
@ -241,14 +241,14 @@ void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighli
|
|||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
|
||||
//Atom
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
|
||||
|
||||
//Other
|
||||
|
@ -261,8 +261,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
{
|
||||
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
|
||||
unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
|
||||
unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
|
||||
unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2;
|
||||
unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2;
|
||||
unsigned nx = sizeof(X_CFLOAT) * cuda_ntypes2;
|
||||
|
||||
//check if enough constant memory is available
|
||||
if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params)
|
||||
|
@ -275,24 +275,24 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
//type conversion of cutoffs and parameters
|
||||
if(need_cut) {
|
||||
X_FLOAT cutsq[cuda_ntypes2];
|
||||
X_CFLOAT cutsq[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
|
||||
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
|
||||
}
|
||||
}
|
||||
|
||||
int cutsqdiffer = 0;
|
||||
X_FLOAT cutsq_global;
|
||||
cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
|
||||
X_CFLOAT cutsq_global;
|
||||
cutsq_global = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
|
||||
|
||||
if(sdata->pair.cut) {
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = i; j <= sdata->atom.ntypes; ++j) {
|
||||
if(sdata->pair.cut[i][j] > 1e-6) {
|
||||
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
|
||||
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
|
||||
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
|
||||
cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
|
||||
}
|
||||
|
||||
if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
|
||||
|
@ -307,8 +307,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = i; j <= sdata->atom.ntypes; ++j) {
|
||||
if(sdata->pair.cut[i][j] > 1e-6) {
|
||||
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
|
||||
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
|
||||
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cutsq[i][j]);
|
||||
cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cutsq[i][j]);
|
||||
}
|
||||
|
||||
if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
|
||||
|
@ -326,28 +326,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx);
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT));
|
||||
}
|
||||
|
||||
if(need_innercut) {
|
||||
X_FLOAT cut_innersq[cuda_ntypes2];
|
||||
X_CFLOAT cut_innersq[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
|
||||
cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
|
||||
}
|
||||
}
|
||||
|
||||
int cutsqdiffer = 0;
|
||||
X_FLOAT cut_innersq_global;
|
||||
cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
|
||||
X_CFLOAT cut_innersq_global;
|
||||
cut_innersq_global = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
|
||||
|
||||
if(sdata->pair.cut_inner) {
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = i; j <= sdata->atom.ntypes; ++j) {
|
||||
if(sdata->pair.cut_inner[i][j] > 1e-6) {
|
||||
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
|
||||
cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
|
||||
cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
|
||||
cut_innersq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
|
||||
}
|
||||
|
||||
if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];
|
||||
|
@ -363,30 +363,30 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx);
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_CFLOAT));
|
||||
}
|
||||
|
||||
if(need_q) {
|
||||
X_FLOAT cut_coulsq[cuda_ntypes2];
|
||||
X_CFLOAT cut_coulsq[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
|
||||
cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
|
||||
}
|
||||
}
|
||||
|
||||
int cutsqdiffer = 0;
|
||||
X_FLOAT cut_coulsq_global;
|
||||
cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
|
||||
X_CFLOAT cut_coulsq_global;
|
||||
cut_coulsq_global = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
|
||||
|
||||
if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global;
|
||||
if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_CFLOAT) sdata->pair.cut_coulsq_global;
|
||||
|
||||
if(sdata->pair.cut_coul) {
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = i; j <= sdata->atom.ntypes; ++j) {
|
||||
if(sdata->pair.cut_coul[i][j] > 1e-6) {
|
||||
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
|
||||
cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
|
||||
cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
|
||||
cut_coulsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
|
||||
}
|
||||
|
||||
if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j];
|
||||
|
@ -402,22 +402,22 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx);
|
||||
}
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_CFLOAT));
|
||||
}
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed");
|
||||
|
||||
if(ncoeff > 0) {
|
||||
F_FLOAT coeff1[cuda_ntypes2];
|
||||
F_CFLOAT coeff1[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j];
|
||||
coeff1[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff1[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice);
|
||||
|
||||
_coeff1_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
|
@ -429,7 +429,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed");
|
||||
cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed");
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
|
@ -445,16 +445,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed");
|
||||
|
||||
if(ncoeff > 1) {
|
||||
F_FLOAT coeff2[cuda_ntypes2];
|
||||
F_CFLOAT coeff2[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j];
|
||||
coeff2[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff2[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice);
|
||||
|
||||
_coeff2_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
|
@ -464,7 +464,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -477,16 +477,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed");
|
||||
|
||||
if(ncoeff > 2) {
|
||||
F_FLOAT coeff3[cuda_ntypes2];
|
||||
F_CFLOAT coeff3[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j];
|
||||
coeff3[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff3[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice);
|
||||
_coeff3_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -495,7 +495,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -507,16 +507,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed");
|
||||
|
||||
if(ncoeff > 3) {
|
||||
F_FLOAT coeff4[cuda_ntypes2];
|
||||
F_CFLOAT coeff4[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
|
||||
coeff4[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff4[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
|
||||
_coeff4_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -525,7 +525,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -537,16 +537,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");
|
||||
|
||||
if(ncoeff > 4) {
|
||||
F_FLOAT coeff5[cuda_ntypes2];
|
||||
F_CFLOAT coeff5[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
|
||||
coeff5[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff5[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
|
||||
_coeff5_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -555,7 +555,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -567,16 +567,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
|
||||
|
||||
if(ncoeff > 5) {
|
||||
F_FLOAT coeff6[cuda_ntypes2];
|
||||
F_CFLOAT coeff6[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
|
||||
coeff6[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff6[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
|
||||
_coeff6_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -585,7 +585,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -596,16 +596,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");
|
||||
|
||||
if(ncoeff > 6) {
|
||||
F_FLOAT coeff7[cuda_ntypes2];
|
||||
F_CFLOAT coeff7[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
|
||||
coeff7[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff7[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
|
||||
_coeff7_gm_tex.normalized = false; // access with normalized texture coordinates
|
||||
_coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -614,7 +614,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -625,16 +625,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");
|
||||
|
||||
if(ncoeff > 7) {
|
||||
F_FLOAT coeff8[cuda_ntypes2];
|
||||
F_CFLOAT coeff8[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
|
||||
coeff8[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff8[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
|
||||
_coeff8_gm_tex.normalized = false; // access with unnormalized (element-index) texture coordinates
|
||||
_coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -643,7 +643,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -654,16 +654,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");
|
||||
|
||||
if(ncoeff > 8) {
|
||||
F_FLOAT coeff9[cuda_ntypes2];
|
||||
F_CFLOAT coeff9[cuda_ntypes2];
|
||||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
|
||||
coeff9[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff9[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if(use_global_params) {
|
||||
cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_CFLOAT*));
|
||||
cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
|
||||
_coeff9_gm_tex.normalized = false; // access with unnormalized (element-index) texture coordinates
|
||||
_coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no
|
||||
|
@ -672,7 +672,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
#if F_PRECISION == 1
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
|
||||
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
|
||||
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
|
||||
#else
|
||||
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
|
||||
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
|
||||
|
@ -682,40 +682,40 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
|
|||
|
||||
CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");
|
||||
|
||||
F_FLOAT special_lj[4];
|
||||
F_CFLOAT special_lj[4];
|
||||
special_lj[0] = sdata->pair.special_lj[0];
|
||||
special_lj[1] = sdata->pair.special_lj[1];
|
||||
special_lj[2] = sdata->pair.special_lj[2];
|
||||
special_lj[3] = sdata->pair.special_lj[3];
|
||||
|
||||
|
||||
X_FLOAT box_size[3] = {
|
||||
X_CFLOAT box_size[3] = {
|
||||
sdata->domain.subhi[0] - sdata->domain.sublo[0],
|
||||
sdata->domain.subhi[1] - sdata->domain.sublo[1],
|
||||
sdata->domain.subhi[2] - sdata->domain.sublo[2]
|
||||
};
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_CFLOAT) * 4);
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
|
||||
|
||||
if(need_q) {
|
||||
F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
|
||||
F_FLOAT special_coul[4];
|
||||
F_CFLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
|
||||
F_CFLOAT special_coul[4];
|
||||
special_coul[0] = sdata->pair.special_coul[0];
|
||||
special_coul[1] = sdata->pair.special_coul[1];
|
||||
special_coul[2] = sdata->pair.special_coul[2];
|
||||
special_coul[3] = sdata->pair.special_coul[3];
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
|
||||
cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_CFLOAT) * 4);
|
||||
cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
}
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_Pair: init failed");
|
||||
|
@ -763,7 +763,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
|
|||
maxthreads = 64;
|
||||
}
|
||||
|
||||
int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
|
||||
int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_CFLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
|
||||
threads.x = layout.z;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
|
@ -771,9 +771,9 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
|
|||
grid.y = layout.y;
|
||||
grid.z = 1;
|
||||
|
||||
int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
|
||||
int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_CFLOAT);
|
||||
|
||||
if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
|
||||
if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_CFLOAT));
|
||||
|
||||
Cuda_UpdateBuffer(sdata, size);
|
||||
|
||||
|
@ -787,7 +787,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
|
|||
|
||||
my_gettime(CLOCK_REALTIME, &startpairtime);
|
||||
|
||||
MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
|
||||
MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
|
||||
}
|
||||
|
||||
//Function which is called after the kernel invocation, collects energy and virial
|
||||
|
@ -810,8 +810,8 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh
|
|||
|
||||
grid.y = 1;
|
||||
dim3 threads(128, 1, 1);
|
||||
MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
|
||||
MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
|
||||
}
|
||||
|
@ -863,15 +863,15 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
|
||||
CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
|
||||
}
|
||||
|
@ -999,7 +999,7 @@ void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
|
|||
grid.y = 1;
|
||||
threads.x = 128;
|
||||
//printf("A grid.x: %i\n",grid.x);
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
|
||||
}
|
||||
|
|
|
@ -32,12 +32,12 @@
|
|||
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||||
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
|
||||
|
||||
ENERGY_FLOAT* sharedE;
|
||||
ENERGY_FLOAT* sharedECoul;
|
||||
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT* sharedE;
|
||||
ENERGY_CFLOAT* sharedECoul;
|
||||
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
sharedE = &sharedmem[threadIdx.x];
|
||||
|
@ -62,12 +62,12 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
|
||||
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_FLOAT delx, dely, delz;
|
||||
F_FLOAT factor_lj, factor_coul;
|
||||
F_FLOAT qtmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
F_CFLOAT factor_lj, factor_coul;
|
||||
F_CFLOAT qtmp;
|
||||
int itype, i, j;
|
||||
int jnum = 0;
|
||||
int* jlist;
|
||||
|
@ -114,7 +114,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
int jtype = static_cast <int>(myxtype.w);
|
||||
|
||||
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||||
|
||||
|
@ -171,7 +171,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
}
|
||||
|
||||
if(coul_type != COUL_NONE) {
|
||||
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||||
const F_CFLOAT qiqj = qtmp * fetchQ(j);
|
||||
|
||||
if(qiqj * qiqj > 1e-8) {
|
||||
const bool in_coul_cutoff =
|
||||
|
@ -188,7 +188,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_CUT: {
|
||||
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul;
|
||||
|
@ -199,11 +199,11 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_DEBYE: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_FLOAT r = _RSQRT_(r2inv);
|
||||
const X_FLOAT rinv = F_F(1.0) / r;
|
||||
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||||
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_CFLOAT r = _RSQRT_(r2inv);
|
||||
const X_CFLOAT rinv = F_F(1.0) / r;
|
||||
const F_CFLOAT screening = _EXP_(-_kappa * r);
|
||||
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul * rinv;
|
||||
|
@ -219,14 +219,14 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_LONG: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT grij = _g_ewald * r;
|
||||
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT grij = _g_ewald * r;
|
||||
const F_CFLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
|
||||
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||||
|
||||
|
@ -248,7 +248,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
|
||||
|
||||
if(in_cutoff) {
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
fxtmp += dxfp = delx * fpair;
|
||||
fytmp += dyfp = dely * fpair;
|
||||
fztmp += dzfp = delz * fpair;
|
||||
|
@ -268,10 +268,10 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
__syncthreads();
|
||||
|
||||
if(ii < _inum) {
|
||||
F_FLOAT* my_f;
|
||||
F_CFLOAT* my_f;
|
||||
|
||||
if(_collect_forces_later) {
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||||
|
@ -284,7 +284,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||||
}
|
||||
|
||||
my_f = (F_FLOAT*) buffer;
|
||||
my_f = (F_CFLOAT*) buffer;
|
||||
my_f += i;
|
||||
*my_f = fxtmp;
|
||||
my_f += _nmax;
|
||||
|
@ -337,14 +337,14 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
if(ii >= _inum)
|
||||
return;
|
||||
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||||
F_FLOAT3* sharedVirial1;
|
||||
F_FLOAT3* sharedVirial2;
|
||||
F_FLOAT* sharedEnergy;
|
||||
F_FLOAT* sharedEnergyCoul;
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
|
||||
F_CFLOAT3* sharedVirial1;
|
||||
F_CFLOAT3* sharedVirial2;
|
||||
F_CFLOAT* sharedEnergy;
|
||||
F_CFLOAT* sharedEnergyCoul;
|
||||
|
||||
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
|
||||
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
|
||||
|
||||
if(vflag) {
|
||||
sharedVirial1 = &sharedForce[64];
|
||||
|
@ -356,25 +356,25 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
|
||||
if(eflag) {
|
||||
if(vflag || vflag_atom)
|
||||
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
|
||||
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
|
||||
else
|
||||
sharedEnergy = (F_FLOAT*) &sharedForce[64];
|
||||
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
|
||||
|
||||
if(coul_type != COUL_NONE)
|
||||
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
|
||||
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
|
||||
|
||||
}
|
||||
|
||||
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT delx, dely, delz;
|
||||
F_FLOAT factor_lj, factor_coul;
|
||||
F_FLOAT fpair;
|
||||
F_FLOAT qtmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
F_CFLOAT factor_lj, factor_coul;
|
||||
F_CFLOAT fpair;
|
||||
F_CFLOAT qtmp;
|
||||
int itype, jnum, i, j;
|
||||
int* jlist;
|
||||
|
||||
|
@ -413,7 +413,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
delz = ztmp - myxtype.z;
|
||||
int jtype = static_cast <int>(myxtype.w);
|
||||
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||||
bool in_coul_cutoff;
|
||||
|
@ -471,7 +471,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
}
|
||||
|
||||
if(coul_type != COUL_NONE) {
|
||||
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||||
const F_CFLOAT qiqj = qtmp * fetchQ(j);
|
||||
|
||||
if(qiqj * qiqj > (1e-8f)) {
|
||||
in_coul_cutoff =
|
||||
|
@ -492,14 +492,14 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_LONG: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT grij = _g_ewald * r;
|
||||
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT grij = _g_ewald * r;
|
||||
const F_CFLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
|
||||
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||||
|
||||
|
@ -514,11 +514,11 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_DEBYE: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_FLOAT r = _RSQRT_(r2inv);
|
||||
const X_FLOAT rinv = F_F(1.0) / r;
|
||||
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||||
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_CFLOAT r = _RSQRT_(r2inv);
|
||||
const X_CFLOAT rinv = F_F(1.0) / r;
|
||||
const F_CFLOAT screening = _EXP_(-_kappa * r);
|
||||
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul * rinv;
|
||||
|
@ -530,7 +530,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
break;
|
||||
|
||||
case COUL_CUT: {
|
||||
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul;
|
||||
|
@ -549,7 +549,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
|
||||
|
||||
if(in_cutoff || in_coul_cutoff) {
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
partialForce.x += dxfp = delx * fpair;
|
||||
partialForce.y += dyfp = dely * fpair;
|
||||
partialForce.z += dzfp = delz * fpair;
|
||||
|
@ -613,10 +613,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
|
||||
if(threadIdx.x == 0) {
|
||||
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
ENERGY_FLOAT tmp_evdwl;
|
||||
ENERGY_CFLOAT tmp_evdwl;
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
|
||||
|
||||
if(eflag_atom)
|
||||
|
@ -635,7 +635,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
}
|
||||
|
||||
if(vflag) {
|
||||
ENERGY_FLOAT tmp;
|
||||
ENERGY_CFLOAT tmp;
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
|
||||
|
||||
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
|
||||
|
@ -663,10 +663,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||||
}
|
||||
|
||||
F_FLOAT* my_f;
|
||||
F_CFLOAT* my_f;
|
||||
|
||||
if(_collect_forces_later) {
|
||||
my_f = (F_FLOAT*) buffer;
|
||||
my_f = (F_CFLOAT*) buffer;
|
||||
my_f += i;
|
||||
*my_f = sharedForce[0].x;
|
||||
my_f += _nmax;
|
||||
|
@ -688,12 +688,12 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
|
|||
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||||
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
|
||||
{
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
|
||||
|
||||
ENERGY_FLOAT* sharedE;
|
||||
ENERGY_FLOAT* sharedECoul;
|
||||
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT* sharedE;
|
||||
ENERGY_CFLOAT* sharedECoul;
|
||||
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
sharedE = &sharedmem[threadIdx.x];
|
||||
|
@ -718,12 +718,12 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_FLOAT delx, dely, delz;
|
||||
F_FLOAT factor_lj, factor_coul;
|
||||
F_FLOAT qtmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
F_CFLOAT factor_lj, factor_coul;
|
||||
F_CFLOAT qtmp;
|
||||
int itype, i, j;
|
||||
int jnum = 0;
|
||||
int* jlist;
|
||||
|
@ -774,7 +774,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
int jtype = static_cast <int>(myxtype.w);
|
||||
|
||||
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||||
|
||||
|
@ -831,7 +831,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
}
|
||||
|
||||
if(coul_type != COUL_NONE) {
|
||||
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||||
const F_CFLOAT qiqj = qtmp * fetchQ(j);
|
||||
|
||||
if(qiqj * qiqj > 1e-8) {
|
||||
const bool in_coul_cutoff =
|
||||
|
@ -848,7 +848,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_CUT: {
|
||||
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul;
|
||||
|
@ -859,11 +859,11 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_DEBYE: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_FLOAT r = _RSQRT_(r2inv);
|
||||
const X_FLOAT rinv = F_F(1.0) / r;
|
||||
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||||
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_CFLOAT r = _RSQRT_(r2inv);
|
||||
const X_CFLOAT rinv = F_F(1.0) / r;
|
||||
const F_CFLOAT screening = _EXP_(-_kappa * r);
|
||||
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul * rinv;
|
||||
|
@ -879,14 +879,14 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_LONG: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT grij = _g_ewald * r;
|
||||
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT grij = _g_ewald * r;
|
||||
const F_CFLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
|
||||
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||||
|
||||
|
@ -909,7 +909,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
|
||||
if(in_cutoff) {
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
fxtmp += dxfp = delx * fpair;
|
||||
fytmp += dyfp = dely * fpair;
|
||||
fztmp += dzfp = delz * fpair;
|
||||
|
@ -929,10 +929,10 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
__syncthreads();
|
||||
|
||||
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
|
||||
F_FLOAT* my_f;
|
||||
F_CFLOAT* my_f;
|
||||
|
||||
if(_collect_forces_later) {
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||||
|
@ -945,7 +945,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||||
}
|
||||
|
||||
my_f = (F_FLOAT*) buffer;
|
||||
my_f = (F_CFLOAT*) buffer;
|
||||
my_f += i;
|
||||
*my_f = fxtmp;
|
||||
my_f += _nmax;
|
||||
|
@ -998,14 +998,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
|
||||
return;
|
||||
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||||
F_FLOAT3* sharedVirial1;
|
||||
F_FLOAT3* sharedVirial2;
|
||||
F_FLOAT* sharedEnergy;
|
||||
F_FLOAT* sharedEnergyCoul;
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
|
||||
F_CFLOAT3* sharedVirial1;
|
||||
F_CFLOAT3* sharedVirial2;
|
||||
F_CFLOAT* sharedEnergy;
|
||||
F_CFLOAT* sharedEnergyCoul;
|
||||
|
||||
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
|
||||
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
|
||||
|
||||
if(vflag) {
|
||||
sharedVirial1 = &sharedForce[64];
|
||||
|
@ -1017,25 +1017,25 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
if(eflag) {
|
||||
if(vflag || vflag_atom)
|
||||
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
|
||||
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
|
||||
else
|
||||
sharedEnergy = (F_FLOAT*) &sharedForce[64];
|
||||
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
|
||||
|
||||
if(coul_type != COUL_NONE)
|
||||
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
|
||||
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
|
||||
|
||||
}
|
||||
|
||||
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT delx, dely, delz;
|
||||
F_FLOAT factor_lj, factor_coul;
|
||||
F_FLOAT fpair;
|
||||
F_FLOAT qtmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
F_CFLOAT factor_lj, factor_coul;
|
||||
F_CFLOAT fpair;
|
||||
F_CFLOAT qtmp;
|
||||
int itype, jnum, i, j;
|
||||
int* jlist;
|
||||
|
||||
|
@ -1074,7 +1074,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
delz = ztmp - myxtype.z;
|
||||
int jtype = static_cast <int>(myxtype.w);
|
||||
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||||
bool in_coul_cutoff;
|
||||
|
@ -1132,7 +1132,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
}
|
||||
|
||||
if(coul_type != COUL_NONE) {
|
||||
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||||
const F_CFLOAT qiqj = qtmp * fetchQ(j);
|
||||
|
||||
if(qiqj * qiqj > (1e-8f)) {
|
||||
in_coul_cutoff =
|
||||
|
@ -1153,14 +1153,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_LONG: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT grij = _g_ewald * r;
|
||||
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT grij = _g_ewald * r;
|
||||
const F_CFLOAT expm2 = _EXP_(-grij * grij);
|
||||
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||||
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||||
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||||
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
|
||||
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||||
|
||||
|
@ -1175,11 +1175,11 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_DEBYE: {
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_FLOAT r = _RSQRT_(r2inv);
|
||||
const X_FLOAT rinv = F_F(1.0) / r;
|
||||
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||||
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const X_CFLOAT r = _RSQRT_(r2inv);
|
||||
const X_CFLOAT rinv = F_F(1.0) / r;
|
||||
const F_CFLOAT screening = _EXP_(-_kappa * r);
|
||||
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul * rinv;
|
||||
|
@ -1191,7 +1191,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
break;
|
||||
|
||||
case COUL_CUT: {
|
||||
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||||
|
||||
if(eflag) {
|
||||
ecoul += forcecoul;
|
||||
|
@ -1210,7 +1210,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
|
||||
if(in_cutoff || in_coul_cutoff) {
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
partialForce.x += dxfp = delx * fpair;
|
||||
partialForce.y += dyfp = dely * fpair;
|
||||
partialForce.z += dzfp = delz * fpair;
|
||||
|
@ -1274,10 +1274,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
if(threadIdx.x == 0) {
|
||||
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
ENERGY_FLOAT tmp_evdwl;
|
||||
ENERGY_CFLOAT tmp_evdwl;
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
|
||||
|
||||
if(eflag_atom)
|
||||
|
@ -1296,7 +1296,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
}
|
||||
|
||||
if(vflag) {
|
||||
ENERGY_FLOAT tmp;
|
||||
ENERGY_CFLOAT tmp;
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
|
||||
|
||||
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
|
||||
|
@ -1324,10 +1324,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
|
|||
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||||
}
|
||||
|
||||
F_FLOAT* my_f;
|
||||
F_CFLOAT* my_f;
|
||||
|
||||
if(_collect_forces_later) {
|
||||
my_f = (F_FLOAT*) buffer;
|
||||
my_f = (F_CFLOAT*) buffer;
|
||||
my_f += i;
|
||||
*my_f = sharedForce[0].x;
|
||||
my_f += _nmax;
|
||||
|
@ -1350,7 +1350,7 @@ __global__ void Pair_GenerateXType_Kernel()
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nall) {
|
||||
X_FLOAT4 xtype;
|
||||
X_CFLOAT4 xtype;
|
||||
xtype.x = _x[i];
|
||||
xtype.y = _x[i + _nmax];
|
||||
xtype.z = _x[i + 2 * _nmax];
|
||||
|
@ -1365,7 +1365,7 @@ __global__ void Pair_GenerateVRadius_Kernel()
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nall) {
|
||||
V_FLOAT4 vradius;
|
||||
V_CFLOAT4 vradius;
|
||||
vradius.x = _v[i];
|
||||
vradius.y = _v[i + _nmax];
|
||||
vradius.z = _v[i + 2 * _nmax];
|
||||
|
@ -1379,7 +1379,7 @@ __global__ void Pair_GenerateOmegaRmass_Kernel()
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nall) {
|
||||
V_FLOAT4 omegarmass;
|
||||
V_CFLOAT4 omegarmass;
|
||||
omegarmass.x = _omega[i];
|
||||
omegarmass.y = _omega[i + _nmax];
|
||||
omegarmass.z = _omega[i + 2 * _nmax];
|
||||
|
@ -1393,7 +1393,7 @@ __global__ void Pair_RevertXType_Kernel()
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nall) {
|
||||
X_FLOAT4 xtype = _x_type[i];
|
||||
X_CFLOAT4 xtype = _x_type[i];
|
||||
_x[i] = xtype.x;
|
||||
_x[i + _nmax] = xtype.y;
|
||||
_x[i + 2 * _nmax] = xtype.z;
|
||||
|
@ -1407,7 +1407,7 @@ __global__ void Pair_BuildXHold_Kernel()
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nall) {
|
||||
X_FLOAT4 xtype = _x_type[i];
|
||||
X_CFLOAT4 xtype = _x_type[i];
|
||||
_xhold[i] = xtype.x;
|
||||
_xhold[i + _nmax] = xtype.y;
|
||||
_xhold[i + 2 * _nmax] = xtype.z;
|
||||
|
@ -1421,10 +1421,10 @@ __global__ void Pair_CollectForces_Kernel(int nperblock, int n)
|
|||
|
||||
if(i >= _nlocal) return;
|
||||
|
||||
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
|
||||
F_FLOAT* my_f = _f + i;
|
||||
F_CFLOAT* buf_f = (F_CFLOAT*) &buf[nperblock * n];
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
buf_f += i;
|
||||
*my_f += * buf_f;
|
||||
my_f += _nmax;
|
||||
|
|
|
@ -21,12 +21,12 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ ENERGY_FLOAT sharedmem[];
|
||||
extern __shared__ ENERGY_CFLOAT sharedmem[];
|
||||
|
||||
static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
|
||||
{
|
||||
__syncthreads();
|
||||
ENERGY_FLOAT* shared = sharedmem;
|
||||
ENERGY_CFLOAT* shared = sharedmem;
|
||||
|
||||
if(eflag) {
|
||||
reduceBlock(shared);
|
||||
|
@ -49,7 +49,7 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i
|
|||
|
||||
if(threadIdx.x == 0) {
|
||||
shared = sharedmem;
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
|
||||
|
@ -79,8 +79,8 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i
|
|||
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
|
||||
{
|
||||
sharedmem[threadIdx.x] = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT sum = ENERGY_F(0.0);
|
||||
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT sum = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
//if(blockIdx.x==2) buf=&buf[n];
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
#define CUDA_PRECISION_H_
|
||||
/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
|
||||
* Predefined behaviour is given by the global CUDA_PRECISION (can be overridden at compile time).
|
||||
* ***_FLOAT: type definition of given property
|
||||
* ***_CFLOAT: type definition of given property
|
||||
* ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float; now use e.g. CUDA_F(1.0))
|
||||
*/
|
||||
|
||||
|
@ -39,17 +39,17 @@
|
|||
|
||||
#ifdef CUDA_PRECISION
|
||||
#if CUDA_PRECISION == 1
|
||||
#define CUDA_FLOAT float
|
||||
#define CUDA_CFLOAT float
|
||||
#define CUDA_F(x) x##f
|
||||
#endif
|
||||
#if CUDA_PRECISION == 2
|
||||
#define CUDA_FLOAT double
|
||||
#define CUDA_CFLOAT double
|
||||
#define CUDA_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CUDA_PRECISION
|
||||
#define CUDA_FLOAT double
|
||||
#define CUDA_CFLOAT double
|
||||
#define CUDA_F(x) x
|
||||
#define CUDA_PRECISION 2
|
||||
#endif
|
||||
|
@ -59,17 +59,17 @@
|
|||
|
||||
#ifdef FFT_PRECISION_CU
|
||||
#if FFT_PRECISION_CU == 1
|
||||
#define FFT_FLOAT float
|
||||
#define FFT_CFLOAT float
|
||||
#define FFT_F(x) x##f
|
||||
#endif
|
||||
#if FFT_PRECISION_CU == 2
|
||||
#define FFT_FLOAT double
|
||||
#define FFT_CFLOAT double
|
||||
#define FFT_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef FFT_PRECISION_CU
|
||||
#define FFT_FLOAT CUDA_FLOAT
|
||||
#define FFT_CFLOAT CUDA_CFLOAT
|
||||
#define FFT_F(x) CUDA_F(x)
|
||||
#define FFT_PRECISION_CU CUDA_PRECISION
|
||||
#endif
|
||||
|
@ -84,24 +84,24 @@
|
|||
|
||||
#ifdef PPPM_PRECISION
|
||||
#if PPPM_PRECISION == 1
|
||||
#define PPPM_FLOAT float
|
||||
#define PPPM_CFLOAT float
|
||||
#ifdef float3
|
||||
#define PPPM_FLOAT3 float3
|
||||
#define PPPM_CFLOAT3 float3
|
||||
#else
|
||||
struct PPPM_FLOAT3 {
|
||||
PPPM_FLOAT x;
|
||||
PPPM_FLOAT y;
|
||||
PPPM_FLOAT z;
|
||||
struct PPPM_CFLOAT3 {
|
||||
PPPM_CFLOAT x;
|
||||
PPPM_CFLOAT y;
|
||||
PPPM_CFLOAT z;
|
||||
};
|
||||
#endif
|
||||
#define PPPM_F(x) x##f
|
||||
#endif
|
||||
#if PPPM_PRECISION == 2
|
||||
#define PPPM_FLOAT double
|
||||
struct PPPM_FLOAT3 {
|
||||
PPPM_FLOAT x;
|
||||
PPPM_FLOAT y;
|
||||
PPPM_FLOAT z;
|
||||
#define PPPM_CFLOAT double
|
||||
struct PPPM_CFLOAT3 {
|
||||
PPPM_CFLOAT x;
|
||||
PPPM_CFLOAT y;
|
||||
PPPM_CFLOAT z;
|
||||
};
|
||||
#define PPPM_F(x) x
|
||||
#endif
|
||||
|
@ -115,17 +115,17 @@ struct PPPM_FLOAT3 {
|
|||
|
||||
#ifdef F_PRECISION
|
||||
#if F_PRECISION == 1
|
||||
#define F_FLOAT float
|
||||
#define F_CFLOAT float
|
||||
#define F_F(x) x##f
|
||||
#endif
|
||||
#if F_PRECISION == 2
|
||||
#define F_FLOAT double
|
||||
#define F_CFLOAT double
|
||||
#define F_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef F_PRECISION
|
||||
#define F_FLOAT CUDA_FLOAT
|
||||
#define F_CFLOAT CUDA_CFLOAT
|
||||
#define F_F(x) CUDA_F(x)
|
||||
#define F_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
@ -141,48 +141,49 @@ struct PPPM_FLOAT3 {
|
|||
#endif
|
||||
|
||||
#if F_PRECISION == 2
|
||||
struct F_FLOAT2 {
|
||||
F_FLOAT x;
|
||||
F_FLOAT y;
|
||||
struct F_CFLOAT2 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
};
|
||||
struct F_FLOAT3 {
|
||||
F_FLOAT x;
|
||||
F_FLOAT y;
|
||||
F_FLOAT z;
|
||||
struct F_CFLOAT3 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
F_CFLOAT z;
|
||||
};
|
||||
struct F_FLOAT4 {
|
||||
F_FLOAT x;
|
||||
F_FLOAT y;
|
||||
F_FLOAT z;
|
||||
F_FLOAT w;
|
||||
struct F_CFLOAT4 {
|
||||
F_CFLOAT x;
|
||||
F_CFLOAT y;
|
||||
F_CFLOAT z;
|
||||
F_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define F_FLOAT2 float2
|
||||
#define F_FLOAT3 float3
|
||||
#define F_FLOAT4 float4
|
||||
#define F_CFLOAT2 float2
|
||||
#define F_CFLOAT3 float3
|
||||
#define F_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
//-----------ENERGY-----------------
|
||||
//--------------------------------
|
||||
|
||||
#ifndef ENERGY_PRECISION
|
||||
#define ENERGY_FLOAT CUDA_FLOAT
|
||||
#define ENERGY_CFLOAT CUDA_CFLOAT
|
||||
#define ENERGY_F(x) CUDA_F(x)
|
||||
#endif
|
||||
|
||||
#ifdef ENERGY_PRECISION
|
||||
#if ENERGY_PRECISION == 1
|
||||
#define ENERGY_FLOAT float
|
||||
#define ENERGY_CFLOAT float
|
||||
#define ENERGY_F(x) x##f
|
||||
#endif
|
||||
#if ENERGY_PRECISION == 2
|
||||
#define ENERGY_FLOAT double
|
||||
#define ENERGY_CFLOAT double
|
||||
#define ENERGY_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ENERGY_PRECISION
|
||||
#define ENERGY_FLOAT CUDA_FLOAT
|
||||
#define ENERGY_CFLOAT CUDA_CFLOAT
|
||||
#define ENERGY_F(x) CUDA_F(x)
|
||||
#define ENERGY_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
@ -193,41 +194,41 @@ struct F_FLOAT4 {
|
|||
|
||||
#ifdef X_PRECISION
|
||||
#if X_PRECISION == 1
|
||||
#define X_FLOAT float
|
||||
#define X_CFLOAT float
|
||||
#define X_F(x) x##f
|
||||
#endif
|
||||
#if X_PRECISION == 2
|
||||
#define X_FLOAT double
|
||||
#define X_CFLOAT double
|
||||
#define X_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef X_PRECISION
|
||||
#define X_FLOAT CUDA_FLOAT
|
||||
#define X_CFLOAT CUDA_CFLOAT
|
||||
#define X_F(x) CUDA_F(x)
|
||||
#define X_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#if X_PRECISION == 2
|
||||
struct X_FLOAT2 {
|
||||
X_FLOAT x;
|
||||
X_FLOAT y;
|
||||
struct X_CFLOAT2 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
};
|
||||
struct X_FLOAT3 {
|
||||
X_FLOAT x;
|
||||
X_FLOAT y;
|
||||
X_FLOAT z;
|
||||
struct X_CFLOAT3 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
X_CFLOAT z;
|
||||
};
|
||||
struct X_FLOAT4 {
|
||||
X_FLOAT x;
|
||||
X_FLOAT y;
|
||||
X_FLOAT z;
|
||||
X_FLOAT w;
|
||||
struct X_CFLOAT4 {
|
||||
X_CFLOAT x;
|
||||
X_CFLOAT y;
|
||||
X_CFLOAT z;
|
||||
X_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define X_FLOAT2 float2
|
||||
#define X_FLOAT3 float3
|
||||
#define X_FLOAT4 float4
|
||||
#define X_CFLOAT2 float2
|
||||
#define X_CFLOAT3 float3
|
||||
#define X_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
//--------------------------------
|
||||
|
@ -236,30 +237,30 @@ struct X_FLOAT4 {
|
|||
|
||||
#ifdef V_PRECISION
|
||||
#if V_PRECISION == 1
|
||||
#define V_FLOAT float
|
||||
#define V_CFLOAT float
|
||||
#define V_F(x) x##f
|
||||
#endif
|
||||
#if V_PRECISION == 2
|
||||
#define V_FLOAT double
|
||||
#define V_CFLOAT double
|
||||
#define V_F(x) x
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef V_PRECISION
|
||||
#define V_FLOAT CUDA_FLOAT
|
||||
#define V_CFLOAT CUDA_CFLOAT
|
||||
#define V_F(x) CUDA_F(x)
|
||||
#define V_PRECISION CUDA_PRECISION
|
||||
#endif
|
||||
|
||||
#if V_PRECISION == 2
|
||||
struct V_FLOAT4 {
|
||||
V_FLOAT x;
|
||||
V_FLOAT y;
|
||||
V_FLOAT z;
|
||||
V_FLOAT w;
|
||||
struct V_CFLOAT4 {
|
||||
V_CFLOAT x;
|
||||
V_CFLOAT y;
|
||||
V_CFLOAT z;
|
||||
V_CFLOAT w;
|
||||
};
|
||||
#else
|
||||
#define V_FLOAT4 float4
|
||||
#define V_CFLOAT4 float4
|
||||
#endif
|
||||
|
||||
#ifdef NO_PREC_TIMING
|
||||
|
|
|
@ -61,9 +61,9 @@ struct cuda_shared_atom { // relevent data from atom class
|
|||
int need_eatom;
|
||||
int need_vatom;
|
||||
|
||||
dev_array x_type; // position + type in X_FLOAT4 struct
|
||||
dev_array v_radius; // velocity + radius in V_FLOAT4 struct; currently only used for granular atom_style
|
||||
dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct; currently only used for granular atom_style
|
||||
dev_array x_type; // position + type in X_CFLOAT4 struct
|
||||
dev_array v_radius; // velocity + radius in V_CFLOAT4 struct; currently only used for granular atom_style
|
||||
dev_array omega_rmass; // omega + rmass in V_CFLOAT4 struct; currently only used for granular atom_style
|
||||
|
||||
double* mass_host; // remember per-type host pointer to masses
|
||||
//int natoms; // total # of atoms in system, could be 0
|
||||
|
@ -82,7 +82,7 @@ struct cuda_shared_atom { // relevent data from atom class
|
|||
int update_neigh;
|
||||
|
||||
dev_array xhold; // position at last neighboring
|
||||
X_FLOAT triggerneighsq; // maximum square movement before reneighboring
|
||||
X_CFLOAT triggerneighsq; // maximum square movement before reneighboring
|
||||
int reneigh_flag; // is reneighboring necessary
|
||||
int maxhold; // size of xhold
|
||||
int dist_check; //perform distance check for reneighboring
|
||||
|
@ -96,9 +96,9 @@ struct cuda_shared_atom { // relevent data from atom class
|
|||
|
||||
struct cuda_shared_pair { // relevant data from pair class
|
||||
char cudable_force; // check for (cudable_force!=0)
|
||||
X_FLOAT cut_global;
|
||||
X_FLOAT cut_inner_global;
|
||||
X_FLOAT cut_coul_global;
|
||||
X_CFLOAT cut_global;
|
||||
X_CFLOAT cut_inner_global;
|
||||
X_CFLOAT cut_coul_global;
|
||||
double** cut; // type-type cutoff
|
||||
double** cutsq; // type-type cutoff
|
||||
double** cut_inner; // type-type cutoff for coul
|
||||
|
@ -116,11 +116,11 @@ struct cuda_shared_pair { // relevent data from pair class
|
|||
double** offset;
|
||||
double* special_lj;
|
||||
double* special_coul;
|
||||
dev_array virial; // ENERGY_FLOAT
|
||||
dev_array eng_vdwl; // ENERGY_FLOAT
|
||||
dev_array eng_coul; // ENERGY_FLOAT
|
||||
X_FLOAT cut_coulsq_global;
|
||||
F_FLOAT g_ewald, kappa;
|
||||
dev_array virial; // ENERGY_CFLOAT
|
||||
dev_array eng_vdwl; // ENERGY_CFLOAT
|
||||
dev_array eng_coul; // ENERGY_CFLOAT
|
||||
X_CFLOAT cut_coulsq_global;
|
||||
F_CFLOAT g_ewald, kappa;
|
||||
int freeze_group_bit;
|
||||
|
||||
dev_array coeff1_gm;
|
||||
|
@ -144,48 +144,48 @@ struct cuda_shared_pair { // relevent data from pair class
|
|||
};
|
||||
|
||||
struct cuda_shared_domain { // relevant data from domain class
|
||||
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
|
||||
X_FLOAT subhi[3];
|
||||
X_FLOAT boxlo[3];
|
||||
X_FLOAT boxhi[3];
|
||||
X_FLOAT prd[3];
|
||||
X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
|
||||
X_CFLOAT subhi[3];
|
||||
X_CFLOAT boxlo[3];
|
||||
X_CFLOAT boxhi[3];
|
||||
X_CFLOAT prd[3];
|
||||
int periodicity[3]; // xyz periodicity as array
|
||||
|
||||
int triclinic;
|
||||
X_FLOAT xy;
|
||||
X_FLOAT xz;
|
||||
X_FLOAT yz;
|
||||
X_FLOAT boxlo_lamda[3];
|
||||
X_FLOAT boxhi_lamda[3];
|
||||
X_FLOAT prd_lamda[3];
|
||||
X_FLOAT h[6];
|
||||
X_FLOAT h_inv[6];
|
||||
V_FLOAT h_rate[6];
|
||||
X_CFLOAT xy;
|
||||
X_CFLOAT xz;
|
||||
X_CFLOAT yz;
|
||||
X_CFLOAT boxlo_lamda[3];
|
||||
X_CFLOAT boxhi_lamda[3];
|
||||
X_CFLOAT prd_lamda[3];
|
||||
X_CFLOAT h[6];
|
||||
X_CFLOAT h_inv[6];
|
||||
V_CFLOAT h_rate[6];
|
||||
int update;
|
||||
};
|
||||
|
||||
struct cuda_shared_pppm {
|
||||
char cudable_force;
|
||||
#ifdef FFT_CUFFT
|
||||
FFT_FLOAT* work1;
|
||||
FFT_FLOAT* work2;
|
||||
FFT_FLOAT* work3;
|
||||
PPPM_FLOAT* greensfn;
|
||||
PPPM_FLOAT* fkx;
|
||||
PPPM_FLOAT* fky;
|
||||
PPPM_FLOAT* fkz;
|
||||
PPPM_FLOAT* vg;
|
||||
FFT_CFLOAT* work1;
|
||||
FFT_CFLOAT* work2;
|
||||
FFT_CFLOAT* work3;
|
||||
PPPM_CFLOAT* greensfn;
|
||||
PPPM_CFLOAT* fkx;
|
||||
PPPM_CFLOAT* fky;
|
||||
PPPM_CFLOAT* fkz;
|
||||
PPPM_CFLOAT* vg;
|
||||
#endif
|
||||
int* part2grid;
|
||||
PPPM_FLOAT* density_brick;
|
||||
PPPM_CFLOAT* density_brick;
|
||||
int* density_brick_int;
|
||||
PPPM_FLOAT density_intScale;
|
||||
PPPM_FLOAT* vdx_brick;
|
||||
PPPM_FLOAT* vdy_brick;
|
||||
PPPM_FLOAT* vdz_brick;
|
||||
PPPM_FLOAT* density_fft;
|
||||
ENERGY_FLOAT* energy;
|
||||
ENERGY_FLOAT* virial;
|
||||
PPPM_CFLOAT density_intScale;
|
||||
PPPM_CFLOAT* vdx_brick;
|
||||
PPPM_CFLOAT* vdy_brick;
|
||||
PPPM_CFLOAT* vdz_brick;
|
||||
PPPM_CFLOAT* density_fft;
|
||||
ENERGY_CFLOAT* energy;
|
||||
ENERGY_CFLOAT* virial;
|
||||
int nxlo_in;
|
||||
int nxhi_in;
|
||||
int nxlo_out;
|
||||
|
@ -201,20 +201,20 @@ struct cuda_shared_pppm {
|
|||
int nx_pppm;
|
||||
int ny_pppm;
|
||||
int nz_pppm;
|
||||
PPPM_FLOAT qqrd2e;
|
||||
PPPM_CFLOAT qqrd2e;
|
||||
int order;
|
||||
// float3 sublo;
|
||||
PPPM_FLOAT* rho_coeff;
|
||||
PPPM_CFLOAT* rho_coeff;
|
||||
int nmax;
|
||||
int nlocal;
|
||||
PPPM_FLOAT* debugdata;
|
||||
PPPM_FLOAT delxinv;
|
||||
PPPM_FLOAT delyinv;
|
||||
PPPM_FLOAT delzinv;
|
||||
PPPM_CFLOAT* debugdata;
|
||||
PPPM_CFLOAT delxinv;
|
||||
PPPM_CFLOAT delyinv;
|
||||
PPPM_CFLOAT delzinv;
|
||||
int nlower;
|
||||
int nupper;
|
||||
PPPM_FLOAT shiftone;
|
||||
PPPM_FLOAT3* fH;
|
||||
PPPM_CFLOAT shiftone;
|
||||
PPPM_CFLOAT3* fH;
|
||||
};
|
||||
|
||||
struct cuda_shared_comm {
|
||||
|
@ -262,7 +262,7 @@ struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cu
|
|||
int maxneighbors;
|
||||
int neigh_lists_per_page;
|
||||
double** cutneighsq;
|
||||
CUDA_FLOAT* cu_cutneighsq;
|
||||
CUDA_CFLOAT* cu_cutneighsq;
|
||||
int* binned_id;
|
||||
int* bin_dim;
|
||||
int bin_nmax;
|
||||
|
|
|
@ -49,8 +49,8 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
|
|||
{
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
|
||||
|
@ -58,19 +58,19 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
|
|||
|
||||
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(triclinic) , & sdata->domain.triclinic , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*));
|
||||
}
|
||||
|
@ -94,15 +94,15 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
|
|||
|
||||
int sharedmem = 0;
|
||||
|
||||
if(box_change) sharedmem = 6 * sizeof(X_FLOAT);
|
||||
if(box_change) sharedmem = 6 * sizeof(X_CFLOAT);
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
sharedmem *= threads.x;
|
||||
|
||||
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_FLOAT)*grid.x * grid.y > sdata->buffersize)))
|
||||
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_FLOAT));
|
||||
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize)))
|
||||
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT));
|
||||
|
||||
|
||||
Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
|
||||
|
@ -111,13 +111,13 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
|
|||
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
|
||||
|
||||
if(box_change) {
|
||||
X_FLOAT buf2[6 * layout.x * layout.y];
|
||||
X_FLOAT* buf = buf2;
|
||||
X_CFLOAT buf2[6 * layout.x * layout.y];
|
||||
X_CFLOAT* buf = buf2;
|
||||
int flag;
|
||||
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//printf("Flag: %i\n",flag);
|
||||
X_FLOAT min, max;
|
||||
X_CFLOAT min, max;
|
||||
min = 1.0 * BIG;
|
||||
max = -1.0 * BIG;
|
||||
|
||||
|
@ -160,7 +160,7 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
|
|||
if(n<128) threads.x=32;
|
||||
else if(n<256) threads.x=64;
|
||||
else threads.x=128;
|
||||
sharedmem=n*sizeof(X_FLOAT);
|
||||
sharedmem=n*sizeof(X_CFLOAT);
|
||||
grid.x=6;
|
||||
grid.y=1;
|
||||
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ X_FLOAT sharedmem[];
|
||||
extern __shared__ X_CFLOAT sharedmem[];
|
||||
|
||||
#define BIG 1e10
|
||||
__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
|
||||
|
@ -29,9 +29,9 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
|
|||
int idim, otherdims;
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT lo[3];
|
||||
X_FLOAT hi[3];
|
||||
X_FLOAT* period;
|
||||
X_CFLOAT lo[3];
|
||||
X_CFLOAT hi[3];
|
||||
X_CFLOAT* period;
|
||||
|
||||
if(_triclinic == 0) {
|
||||
lo[0] = _boxlo[0];
|
||||
|
@ -54,11 +54,11 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
|
|||
}
|
||||
|
||||
|
||||
X_FLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
|
||||
X_FLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
|
||||
X_FLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);
|
||||
X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
|
||||
X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
|
||||
X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);
|
||||
|
||||
X_FLOAT* buf = (X_FLOAT*) _buffer;
|
||||
X_CFLOAT* buf = (X_CFLOAT*) _buffer;
|
||||
buf += blockIdx.x * gridDim.y + blockIdx.y;
|
||||
buf[0] = tmpx;
|
||||
buf += gridDim.x * gridDim.y;
|
||||
|
@ -181,12 +181,12 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
|
|||
__syncthreads();
|
||||
|
||||
if(box_change) {
|
||||
X_FLOAT minx = BIG;
|
||||
X_FLOAT maxx = -BIG;
|
||||
X_FLOAT miny = BIG;
|
||||
X_FLOAT maxy = -BIG;
|
||||
X_FLOAT minz = BIG;
|
||||
X_FLOAT maxz = -BIG;
|
||||
X_CFLOAT minx = BIG;
|
||||
X_CFLOAT maxx = -BIG;
|
||||
X_CFLOAT miny = BIG;
|
||||
X_CFLOAT maxy = -BIG;
|
||||
X_CFLOAT minz = BIG;
|
||||
X_CFLOAT maxz = -BIG;
|
||||
|
||||
if(not _periodicity[0]) {
|
||||
sharedmem[threadIdx.x] = tmpx;
|
||||
|
@ -231,7 +231,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
|
|||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buf = (X_FLOAT*) _buffer;
|
||||
buf = (X_CFLOAT*) _buffer;
|
||||
buf += blockIdx.x * gridDim.y + blockIdx.y;
|
||||
buf[0] = minx;
|
||||
buf += gridDim.x * gridDim.y;
|
||||
|
@ -250,7 +250,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
|
|||
|
||||
__global__ void Domain_reduceBoxExtent(double* extent, int n)
|
||||
{
|
||||
X_FLOAT* buf = (X_FLOAT*) _buffer;
|
||||
X_CFLOAT* buf = (X_CFLOAT*) _buffer;
|
||||
buf += blockIdx.x * n;
|
||||
copyGlobToShared(buf, sharedmem, n);
|
||||
|
||||
|
@ -267,8 +267,8 @@ __global__ void Domain_lamda2x_Kernel(int n)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
X_FLOAT ytmp = _x[i + _nmax];
|
||||
X_FLOAT ztmp = _x[i + 2 * _nmax];
|
||||
X_CFLOAT ytmp = _x[i + _nmax];
|
||||
X_CFLOAT ztmp = _x[i + 2 * _nmax];
|
||||
_x[i] = _h[0] * _x[i] + _h[5] * ytmp + _h[4] * ztmp + _boxlo[0];
|
||||
_x[i + _nmax] = _h[1] * ytmp + _h[3] * ztmp + _boxlo[1];
|
||||
_x[i + 2 * _nmax] = _h[2] * ztmp + _boxlo[2];
|
||||
|
@ -279,7 +279,7 @@ __global__ void Domain_x2lamda_Kernel(int n)
|
|||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT delta[3];
|
||||
X_CFLOAT delta[3];
|
||||
|
||||
if(i < n) {
|
||||
delta[0] = _x[i] - _boxlo[0];
|
||||
|
|
|
@ -25,15 +25,15 @@
|
|||
#include "cuda_precision.h"
|
||||
#include "cuda_common.h"
|
||||
struct FFT_DATA {
|
||||
FFT_FLOAT re;
|
||||
FFT_FLOAT im;
|
||||
FFT_CFLOAT re;
|
||||
FFT_CFLOAT im;
|
||||
};
|
||||
|
||||
#include "fft3d_cuda_cu.h"
|
||||
#include "fft3d_cuda_kernel.cu"
|
||||
#include <stdio.h>
|
||||
|
||||
void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
|
||||
void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow)
|
||||
{
|
||||
|
||||
dim3 grid;
|
||||
|
@ -62,7 +62,7 @@ void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
|
|||
threads.x = nfast * 2;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
|
||||
permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
|
||||
cudaThreadSynchronize();
|
||||
MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
|
||||
}
|
||||
|
@ -78,7 +78,7 @@ void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
|
|||
threads.x = nfast * 2;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
|
||||
permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
|
||||
|
@ -92,7 +92,7 @@ void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, i
|
|||
threads.x = (khi - klo + 1) * 2;
|
||||
threads.y = 1;
|
||||
threads.z = 1;
|
||||
permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
|
||||
permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
|
||||
extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);
|
||||
|
|
|
@ -21,24 +21,24 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__global__ void initfftdata_kernel(double* in, FFT_FLOAT* out)
|
||||
__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out)
|
||||
{
|
||||
out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
|
||||
out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0;
|
||||
}
|
||||
|
||||
|
||||
__global__ void permute_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
|
||||
__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
|
||||
{
|
||||
out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
|
||||
}
|
||||
|
||||
__global__ void permute_scale_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
|
||||
__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
|
||||
{
|
||||
out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5;
|
||||
}
|
||||
|
||||
__global__ void permute_part_kernel(FFT_FLOAT* in, FFT_FLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
|
||||
__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
|
||||
{
|
||||
{
|
||||
out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo];
|
||||
|
|
|
@ -33,10 +33,10 @@
|
|||
|
||||
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -55,8 +55,8 @@ void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -64,7 +64,7 @@ void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
|
|||
Cuda_FixAddForceCuda_UpdateNmax(sdata);
|
||||
}
|
||||
|
||||
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal)
|
||||
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixAddForceCuda_UpdateNmax(sdata);
|
||||
|
@ -75,18 +75,18 @@ void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO
|
|||
if(sdata->buffer_new)
|
||||
Cuda_FixAddForceCuda_UpdateBuffer(sdata);
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
|
||||
Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x;
|
||||
grid.x = 4;
|
||||
threads.x = 512;
|
||||
reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
|
||||
reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
|
||||
|
||||
|
|
|
@ -24,4 +24,4 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal);
|
||||
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal);
|
||||
|
|
|
@ -21,10 +21,10 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_FLOAT sharedmem[];
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
|
||||
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@ -51,7 +51,7 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
|
|||
reduceBlock(&sharedmem[blockDim.x]);
|
||||
reduceBlock(&sharedmem[2 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[3 * blockDim.x]);
|
||||
F_FLOAT* buffer = (F_FLOAT*) _buffer;
|
||||
F_CFLOAT* buffer = (F_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -63,12 +63,12 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
|
|||
}
|
||||
|
||||
|
||||
__global__ void reduce_foriginal(int n, F_FLOAT* foriginal)
|
||||
__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
F_FLOAT myforig = 0.0;
|
||||
F_FLOAT* buf = (F_FLOAT*) _buffer;
|
||||
F_CFLOAT myforig = 0.0;
|
||||
F_CFLOAT* buf = (F_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
|
|
@ -33,10 +33,10 @@
|
|||
|
||||
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -55,8 +55,8 @@ void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -64,7 +64,7 @@ void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
|
|||
Cuda_FixAveForceCuda_UpdateNmax(sdata);
|
||||
}
|
||||
|
||||
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal)
|
||||
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixAveForceCuda_UpdateNmax(sdata);
|
||||
|
@ -75,25 +75,25 @@ void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,
|
|||
if(sdata->buffer_new)
|
||||
Cuda_FixAveForceCuda_UpdateBuffer(sdata);
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
|
||||
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit);
|
||||
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x;
|
||||
grid.x = 4;
|
||||
threads.x = 512;
|
||||
Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
|
||||
Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
|
||||
|
||||
}
|
||||
|
||||
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue)
|
||||
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
|
|
|
@ -24,5 +24,5 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal);
|
||||
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_FLOAT sharedmem[];
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
|
||||
|
@ -44,7 +44,7 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
|
|||
reduceBlock(&sharedmem[blockDim.x]);
|
||||
reduceBlock(&sharedmem[2 * blockDim.x]);
|
||||
reduceBlock(&sharedmem[3 * blockDim.x]);
|
||||
F_FLOAT* buffer = (F_FLOAT*) _buffer;
|
||||
F_CFLOAT* buffer = (F_CFLOAT*) _buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -55,12 +55,12 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
|
|||
}
|
||||
|
||||
|
||||
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal)
|
||||
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
F_FLOAT myforig = 0.0;
|
||||
F_FLOAT* buf = (F_FLOAT*) _buffer;
|
||||
F_CFLOAT myforig = 0.0;
|
||||
F_CFLOAT* buf = (F_CFLOAT*) _buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
@ -81,7 +81,7 @@ __global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal)
|
|||
foriginal[blockIdx.x] = myforig;
|
||||
}
|
||||
|
||||
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
|
||||
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -34,8 +34,8 @@ void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
|
||||
|
|
|
@ -32,10 +32,10 @@
|
|||
|
||||
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -55,9 +55,9 @@ void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
|
||||
|
@ -68,7 +68,7 @@ void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
|
|||
}
|
||||
|
||||
|
||||
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal)
|
||||
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixFreezeCuda_UpdateNmax(sdata);
|
||||
|
@ -80,18 +80,18 @@ void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT
|
|||
Cuda_FixFreezeCuda_UpdateBuffer(sdata);
|
||||
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit);
|
||||
Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x;
|
||||
grid.x = 3;
|
||||
threads.x = 512;
|
||||
Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal);
|
||||
Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
|
||||
|
||||
|
|
|
@ -24,4 +24,4 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal);
|
||||
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal);
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_FLOAT sharedmem[];
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
|
||||
|
@ -49,7 +49,7 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
|
|||
reduceBlock(sharedmem);
|
||||
reduceBlock(&sharedmem[blockDim.x]);
|
||||
reduceBlock(&sharedmem[2 * blockDim.x]);
|
||||
F_FLOAT* buffer = (F_FLOAT*)_buffer;
|
||||
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -59,12 +59,12 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
|
|||
}
|
||||
|
||||
|
||||
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
|
||||
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
F_FLOAT myforig = 0.0;
|
||||
F_FLOAT* buf = (F_FLOAT*)_buffer;
|
||||
F_CFLOAT myforig = 0.0;
|
||||
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
|
|
@ -32,10 +32,10 @@
|
|||
|
||||
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -55,12 +55,12 @@ void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -70,7 +70,7 @@ void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
|
|||
}
|
||||
|
||||
|
||||
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
|
||||
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixGravityCuda_UpdateNmax(sdata);
|
||||
|
|
|
@ -24,4 +24,4 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc);
|
||||
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc);
|
||||
|
|
|
@ -21,13 +21,13 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
|
||||
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
F_FLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]];
|
||||
F_CFLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]];
|
||||
_f[i] += mass * xacc;
|
||||
_f[i + 1 * _nmax] += mass * yacc;
|
||||
_f[i + 2 * _nmax] += mass * zacc;
|
||||
|
|
|
@ -32,21 +32,21 @@
|
|||
|
||||
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
|
||||
}
|
||||
|
||||
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
|
@ -67,12 +67,12 @@ void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
|
||||
}
|
||||
|
||||
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
|
||||
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
|
||||
Cuda_FixNHCuda_UpdateNmax(sdata);
|
||||
|
@ -97,8 +97,8 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa
|
|||
if(sdata->buffer_new)
|
||||
Cuda_FixNHCuda_UpdateBuffer(sdata);
|
||||
|
||||
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_FLOAT3 factor2;
|
||||
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_CFLOAT3 factor2;
|
||||
|
||||
if(p_triclinic) {
|
||||
factor2.x = factor_h[3], factor2.y = factor_h[4];
|
||||
|
@ -125,8 +125,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro
|
|||
if(sdata->buffer_new)
|
||||
Cuda_FixNHCuda_UpdateBuffer(sdata);
|
||||
|
||||
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_FLOAT3 factor2;
|
||||
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_CFLOAT3 factor2;
|
||||
|
||||
if(p_triclinic) {
|
||||
factor2.x = factor_h[3], factor2.y = factor_h[4];
|
||||
|
@ -143,7 +143,7 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro
|
|||
|
||||
}
|
||||
|
||||
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
{
|
||||
my_times atime1, atime2;
|
||||
my_gettime(CLOCK_REALTIME, &atime1);
|
||||
|
@ -237,8 +237,8 @@ void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int gro
|
|||
if(sdata->buffer_new)
|
||||
Cuda_FixNHCuda_UpdateBuffer(sdata);
|
||||
|
||||
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_FLOAT3 factor2;
|
||||
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
|
||||
F_CFLOAT3 factor2;
|
||||
|
||||
if(p_triclinic) {
|
||||
factor2.x = factor_h[3], factor2.y = factor_h[4];
|
||||
|
|
|
@ -23,9 +23,9 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
|
||||
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
|
||||
|
|
|
@ -21,14 +21,14 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
|
||||
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
|
||||
{
|
||||
if(_dist_check) {
|
||||
|
||||
X_FLOAT d = X_F(0.0);
|
||||
X_CFLOAT d = X_F(0.0);
|
||||
|
||||
if(i < _nlocal) {
|
||||
X_FLOAT tmp = xtmp - _xhold[i];
|
||||
X_CFLOAT tmp = xtmp - _xhold[i];
|
||||
d = tmp * tmp;
|
||||
tmp = ytmp - _xhold[i + _maxhold];
|
||||
d += tmp * tmp;
|
||||
|
@ -43,15 +43,15 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO
|
|||
}
|
||||
}
|
||||
|
||||
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
|
||||
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
V_FLOAT* my_v = _v + i;
|
||||
V_FLOAT vx = my_v[0];
|
||||
V_FLOAT vy = my_v[_nmax];
|
||||
V_FLOAT vz = my_v[2 * _nmax];
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
V_CFLOAT vx = my_v[0];
|
||||
V_CFLOAT vy = my_v[_nmax];
|
||||
V_CFLOAT vz = my_v[2 * _nmax];
|
||||
vx *= factor.x;
|
||||
vy *= factor.y;
|
||||
vz *= factor.z;
|
||||
|
@ -71,12 +71,12 @@ __global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p
|
|||
|
||||
}
|
||||
|
||||
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
|
||||
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
V_FLOAT* my_v = _v + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
my_v[0] *= factor_eta;
|
||||
my_v[_nmax] *= factor_eta;
|
||||
my_v[2 * _nmax] *= factor_eta;
|
||||
|
@ -84,22 +84,22 @@ __global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
|
|||
|
||||
}
|
||||
|
||||
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
|
||||
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf;
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
||||
V_FLOAT vx = my_v[0];
|
||||
V_FLOAT vy = my_v[_nmax];
|
||||
V_FLOAT vz = my_v[2 * _nmax];
|
||||
V_CFLOAT vx = my_v[0];
|
||||
V_CFLOAT vy = my_v[_nmax];
|
||||
V_CFLOAT vz = my_v[2 * _nmax];
|
||||
vx *= factor.x;
|
||||
vy *= factor.y;
|
||||
vz *= factor.z;
|
||||
|
@ -125,10 +125,10 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf;
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
@ -145,13 +145,13 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit)
|
|||
|
||||
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
|
||||
{
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
V_FLOAT* my_v = _v + i;
|
||||
X_FLOAT* my_x = _x + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
|
||||
xtmp = *my_x += _dtv * *my_v;
|
||||
my_v += _nmax;
|
||||
|
@ -166,23 +166,23 @@ __global__ void FixNHCuda_nve_x_Kernel(int groupbit)
|
|||
}
|
||||
|
||||
|
||||
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
|
||||
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
|
||||
{
|
||||
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf;
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
||||
V_FLOAT vx = my_v[0] + dtfm * my_f[0];
|
||||
V_FLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
|
||||
V_FLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
|
||||
V_CFLOAT vx = my_v[0] + dtfm * my_f[0];
|
||||
V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
|
||||
V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
|
||||
|
||||
vx *= factor.x;
|
||||
vy *= factor.y;
|
||||
|
|
|
@ -32,19 +32,19 @@
|
|||
|
||||
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
|
||||
}
|
||||
|
||||
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
|
@ -65,12 +65,12 @@ void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
|
||||
}
|
||||
|
||||
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
|
||||
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
|
||||
Cuda_FixNVECuda_UpdateNmax(sdata);
|
||||
|
|
|
@ -23,6 +23,6 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
|
||||
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
|
||||
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
|
||||
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
|
||||
|
|
|
@ -21,11 +21,11 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
|
||||
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
|
||||
{
|
||||
if(_dist_check) {
|
||||
X_FLOAT tmp = xtmp - _xhold[i];
|
||||
X_FLOAT d = tmp * tmp;
|
||||
X_CFLOAT tmp = xtmp - _xhold[i];
|
||||
X_CFLOAT d = tmp * tmp;
|
||||
tmp = ytmp - _xhold[i + _maxhold];
|
||||
d += tmp * tmp;
|
||||
tmp = ztmp - _xhold[i + 2 * _maxhold];
|
||||
|
@ -41,7 +41,7 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO
|
|||
|
||||
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
|
||||
{
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
#ifdef CUDA_USE_BINNING
|
||||
|
||||
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
|
||||
|
@ -50,16 +50,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
|
|||
const int i = 3 * blockDim.x * bin + threadIdx.x;
|
||||
|
||||
if(_mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _binned_f + i;
|
||||
V_FLOAT* my_v = _binned_v + i;
|
||||
X_FLOAT* my_x = _binned_x + i;
|
||||
F_CFLOAT* my_f = _binned_f + i;
|
||||
V_CFLOAT* my_v = _binned_v + i;
|
||||
X_CFLOAT* my_x = _binned_x + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf
|
||||
V_CFLOAT dtfm = _dtf
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
|
||||
|
||||
V_FLOAT v_mem;
|
||||
V_CFLOAT v_mem;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
xtmp = *my_x += _dtv * v_mem;
|
||||
my_f += blockDim.x;
|
||||
|
@ -80,16 +80,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
X_FLOAT* my_x = _x + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf;
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
||||
V_FLOAT v_mem;
|
||||
V_CFLOAT v_mem;
|
||||
v_mem = *my_v += dtfm * (*my_f);
|
||||
xtmp = *my_x += _dtv * v_mem;
|
||||
my_f += _nmax;
|
||||
|
@ -119,10 +119,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
|
|||
const int i = 3 * blockDim.x * bin + threadIdx.x;
|
||||
|
||||
if(_mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _binned_f + i;
|
||||
V_FLOAT* my_v = _binned_v + i;
|
||||
F_CFLOAT* my_f = _binned_f + i;
|
||||
V_CFLOAT* my_v = _binned_v + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf
|
||||
V_CFLOAT dtfm = _dtf
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
|
||||
|
@ -142,10 +142,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal && _mask[i] & groupbit) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
|
||||
V_FLOAT dtfm = _dtf;
|
||||
V_CFLOAT dtfm = _dtf;
|
||||
|
||||
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
|
||||
else dtfm *= V_F(1.0) / _mass[_type[i]];
|
||||
|
|
|
@ -32,10 +32,10 @@
|
|||
|
||||
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
|
||||
{
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -55,8 +55,8 @@ void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -66,7 +66,7 @@ void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
|
|||
}
|
||||
|
||||
|
||||
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz)
|
||||
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz)
|
||||
{
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixSetForceCuda_UpdateNmax(sdata);
|
||||
|
@ -78,18 +78,18 @@ void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO
|
|||
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
|
||||
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
|
||||
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
|
||||
|
||||
int oldgrid = grid.x;
|
||||
grid.x = 3;
|
||||
threads.x = 512;
|
||||
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal);
|
||||
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
|
||||
|
||||
|
|
|
@ -24,4 +24,4 @@
|
|||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz);
|
||||
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz);
|
||||
|
|
|
@ -21,10 +21,10 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
extern __shared__ F_FLOAT sharedmem[];
|
||||
extern __shared__ F_CFLOAT sharedmem[];
|
||||
|
||||
|
||||
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, int flagx, int flagy, int flagz)
|
||||
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
|
@ -48,7 +48,7 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
|
|||
reduceBlock(sharedmem);
|
||||
reduceBlock(&sharedmem[blockDim.x]);
|
||||
reduceBlock(&sharedmem[2 * blockDim.x]);
|
||||
F_FLOAT* buffer = (F_FLOAT*)_buffer;
|
||||
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
|
||||
|
@ -58,12 +58,12 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
|
|||
}
|
||||
|
||||
|
||||
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
|
||||
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
|
||||
{
|
||||
int i = 0;
|
||||
sharedmem[threadIdx.x] = 0;
|
||||
F_FLOAT myforig = 0.0;
|
||||
F_FLOAT* buf = (F_FLOAT*)_buffer;
|
||||
F_CFLOAT myforig = 0.0;
|
||||
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
|
||||
buf = &buf[blockIdx.x * n];
|
||||
|
||||
while(i < n) {
|
||||
|
|
|
@ -41,37 +41,37 @@
|
|||
__device__ __constant__ int* _shake_atom;
|
||||
__device__ __constant__ int* _shake_type;
|
||||
__device__ __constant__ int* _shake_flag;
|
||||
__device__ __constant__ X_FLOAT3* _xshake;
|
||||
__device__ __constant__ F_FLOAT _dtfsq;
|
||||
__device__ __constant__ X_FLOAT* _bond_distance;
|
||||
__device__ __constant__ X_FLOAT* _angle_distance;
|
||||
__device__ __constant__ X_CFLOAT3* _xshake;
|
||||
__device__ __constant__ F_CFLOAT _dtfsq;
|
||||
__device__ __constant__ X_CFLOAT* _bond_distance;
|
||||
__device__ __constant__ X_CFLOAT* _angle_distance;
|
||||
__device__ __constant__ int _max_iter;
|
||||
__device__ __constant__ X_FLOAT _tolerance;
|
||||
__device__ __constant__ X_CFLOAT _tolerance;
|
||||
|
||||
#include "fix_shake_cuda_kernel.cu"
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
|
||||
{
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_FLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_FLOAT) * 6);
|
||||
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6);
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
|
||||
|
@ -89,10 +89,10 @@ void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
|
|||
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
|
||||
}
|
||||
|
||||
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
|
||||
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
|
||||
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
|
||||
void* bond_distance, void* angle_distance, void* virial,
|
||||
int max_iter, X_FLOAT tolerance)
|
||||
int max_iter, X_CFLOAT tolerance)
|
||||
{
|
||||
Cuda_FixShakeCuda_UpdateNmax(sdata);
|
||||
Cuda_FixShakeCuda_UpdateDomain(sdata);
|
||||
|
@ -100,17 +100,17 @@ void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
|
|||
cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*));
|
||||
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT));
|
||||
|
||||
if(sdata->atom.mass_host)
|
||||
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
|
||||
|
||||
|
@ -149,16 +149,16 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom,
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_FLOAT), 64);
|
||||
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->buffer_new)
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_FLOAT));
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT));
|
||||
|
||||
BindXTypeTexture(sdata);
|
||||
|
||||
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, list, nlist);
|
||||
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, nlist);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
|
||||
|
@ -168,7 +168,7 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom,
|
|||
grid.x = 6;
|
||||
grid.y = 1;
|
||||
threads.x = 256;
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
|
||||
}
|
||||
|
@ -183,14 +183,14 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -212,7 +212,7 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
|
|||
cudaMemset(sdata->flag, 0, sizeof(int));
|
||||
FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
|
||||
cudaThreadSynchronize();
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
int aflag;
|
||||
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
|
@ -232,16 +232,16 @@ int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, i
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
||||
static int count = -1;
|
||||
count++;
|
||||
X_FLOAT dx = 0.0;
|
||||
X_FLOAT dy = 0.0;
|
||||
X_FLOAT dz = 0.0;
|
||||
X_CFLOAT dx = 0.0;
|
||||
X_CFLOAT dy = 0.0;
|
||||
X_CFLOAT dz = 0.0;
|
||||
|
||||
if(pbc_flag != 0) {
|
||||
if(sdata->domain.triclinic == 0) {
|
||||
|
@ -278,7 +278,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi
|
|||
if(sdata->atom.update_nlocal)
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
|
||||
int size = n * 3 * sizeof(X_FLOAT);
|
||||
int size = n * 3 * sizeof(X_CFLOAT);
|
||||
|
||||
if(sdata->buffer_new or (size > sdata->buffersize))
|
||||
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
|
||||
|
@ -288,7 +288,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi
|
|||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
if(sdata->atom.nlocal > 0) {
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
|
||||
FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
|
||||
|
|
|
@ -22,10 +22,10 @@
|
|||
------------------------------------------------------------------------- */
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
|
||||
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
|
||||
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
|
||||
void* bond_distance, void* angle_distance, void* virial,
|
||||
int max_iter, X_FLOAT tolerance);
|
||||
int max_iter, X_CFLOAT tolerance);
|
||||
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
|
||||
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
|
||||
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
|
||||
|
|
|
@ -21,12 +21,12 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_FLOAT total, ENERGY_FLOAT* v)
|
||||
__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_CFLOAT total, ENERGY_CFLOAT* v)
|
||||
{
|
||||
/*if(vflag_global)
|
||||
{
|
||||
ENERGY_FLOAT fraction = n/total;
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT fraction = n/total;
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
*shared += fraction*v[0]; shared+=blockDim.x;
|
||||
*shared += fraction*v[1]; shared+=blockDim.x;
|
||||
*shared += fraction*v[2]; shared+=blockDim.x;
|
||||
|
@ -35,11 +35,11 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E
|
|||
*shared += fraction*v[5];
|
||||
}*/
|
||||
if(vflag_atom) {
|
||||
ENERGY_FLOAT fraction = ENERGY_F(1.0) / total;
|
||||
ENERGY_CFLOAT fraction = ENERGY_F(1.0) / total;
|
||||
|
||||
for(int i = 0; i < n; i++) {
|
||||
int m = list[i];
|
||||
ENERGY_FLOAT* myvatom = &_vatom[m];
|
||||
ENERGY_CFLOAT* myvatom = &_vatom[m];
|
||||
|
||||
*myvatom += fraction * v[0];
|
||||
myvatom += _nmax;
|
||||
|
@ -56,7 +56,7 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E
|
|||
}
|
||||
}
|
||||
|
||||
inline __device__ void minimum_image(X_FLOAT3 &delta)
|
||||
inline __device__ void minimum_image(X_CFLOAT3 &delta)
|
||||
{
|
||||
if(_triclinic == 0) {
|
||||
if(_periodicity[0]) {
|
||||
|
@ -106,14 +106,14 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
|
|||
|
||||
if(i >= _nlocal) return;
|
||||
|
||||
X_FLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)};
|
||||
X_CFLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)};
|
||||
|
||||
if(_shake_flag[i]) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
V_FLOAT* my_v = _v + i;
|
||||
X_FLOAT* my_x = _x + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
V_CFLOAT* my_v = _v + i;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
|
||||
V_FLOAT dtfmsq = _dtfsq;
|
||||
V_CFLOAT dtfmsq = _dtfsq;
|
||||
|
||||
if(_rmass_flag) dtfmsq *= V_F(1.0) / _rmass[i];
|
||||
else dtfmsq *= V_F(1.0) / _mass[_type[i]];
|
||||
|
@ -138,20 +138,20 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
|
|||
__device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
||||
{
|
||||
int nlist, list[2];
|
||||
ENERGY_FLOAT v[6];
|
||||
X_FLOAT invmass0, invmass1;
|
||||
ENERGY_CFLOAT v[6];
|
||||
X_CFLOAT invmass0, invmass1;
|
||||
|
||||
// local atom IDs and constraint distances
|
||||
|
||||
int i0 = _map_array[_shake_atom[m]];
|
||||
int i1 = _map_array[_shake_atom[m + _nmax]];
|
||||
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
|
||||
// r01 = distance vec between atoms, with PBC
|
||||
|
||||
X_FLOAT3 r01;
|
||||
X_CFLOAT3 r01;
|
||||
|
||||
X_FLOAT4 x_i0, x_i1;
|
||||
X_CFLOAT4 x_i0, x_i1;
|
||||
x_i0 = fetchXType(i0);
|
||||
x_i1 = fetchXType(i1);
|
||||
|
||||
|
@ -162,9 +162,9 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// s01 = distance vec after unconstrained update, with PBC
|
||||
|
||||
X_FLOAT3 s01;
|
||||
X_FLOAT3 xs_i0 = _xshake[i0];
|
||||
X_FLOAT3 xs_i1 = _xshake[i1];
|
||||
X_CFLOAT3 s01;
|
||||
X_CFLOAT3 xs_i0 = _xshake[i0];
|
||||
X_CFLOAT3 xs_i1 = _xshake[i1];
|
||||
|
||||
s01.x = xs_i0.x - xs_i1.x;
|
||||
s01.y = xs_i0.y - xs_i1.y;
|
||||
|
@ -173,8 +173,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// scalar distances between atoms
|
||||
|
||||
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
|
||||
// a,b,c = coeffs in quadratic equation for lamda
|
||||
|
||||
|
@ -186,14 +186,14 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
|
||||
}
|
||||
|
||||
X_FLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_FLOAT b = X_F(2.0) * (invmass0 + invmass1) *
|
||||
X_CFLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_CFLOAT b = X_F(2.0) * (invmass0 + invmass1) *
|
||||
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
|
||||
X_FLOAT c = s01sq - bond1 * bond1;
|
||||
X_CFLOAT c = s01sq - bond1 * bond1;
|
||||
|
||||
// error check
|
||||
|
||||
X_FLOAT determ = b * b - X_F(4.0) * a * c;
|
||||
X_CFLOAT determ = b * b - X_F(4.0) * a * c;
|
||||
|
||||
if(determ < X_F(0.0)) {
|
||||
_flag[0]++;
|
||||
|
@ -202,7 +202,7 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// exact quadratic solution for lamda
|
||||
|
||||
X_FLOAT lamda, lamda1, lamda2;
|
||||
X_CFLOAT lamda, lamda1, lamda2;
|
||||
lamda1 = -b + _SQRT_(determ);
|
||||
lamda2 = -lamda1 - X_F(2.0) * b;
|
||||
lamda1 *= X_F(1.0) / (X_F(2.0) * a);
|
||||
|
@ -233,8 +233,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
}
|
||||
|
||||
if(vflag || vflag_atom) {
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_FLOAT factor = nlist;
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_CFLOAT factor = nlist;
|
||||
v[0] = lamda * r01.x * r01.x;
|
||||
*shared = factor * v[0];
|
||||
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
|
||||
|
@ -262,22 +262,22 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
|
|||
__device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
||||
{
|
||||
int nlist, list[3];
|
||||
ENERGY_FLOAT v[6];
|
||||
X_FLOAT invmass0, invmass1, invmass2;
|
||||
ENERGY_CFLOAT v[6];
|
||||
X_CFLOAT invmass0, invmass1, invmass2;
|
||||
|
||||
// local atom IDs and constraint distances
|
||||
|
||||
int i0 = _map_array[_shake_atom[m]];
|
||||
int i1 = _map_array[_shake_atom[m + _nmax]];
|
||||
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
|
||||
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
|
||||
// r01 = distance vec between atoms, with PBC
|
||||
|
||||
X_FLOAT3 r01, r02;
|
||||
X_CFLOAT3 r01, r02;
|
||||
|
||||
X_FLOAT4 x_i0, x_i1, x_i2;
|
||||
X_CFLOAT4 x_i0, x_i1, x_i2;
|
||||
x_i0 = fetchXType(i0);
|
||||
x_i1 = fetchXType(i1);
|
||||
x_i2 = fetchXType(i2);
|
||||
|
@ -294,10 +294,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// s01 = distance vec after unconstrained update, with PBC
|
||||
|
||||
X_FLOAT3 s01, s02;
|
||||
X_FLOAT3 xs_i0 = _xshake[i0];
|
||||
X_FLOAT3 xs_i1 = _xshake[i1];
|
||||
X_FLOAT3 xs_i2 = _xshake[i2];
|
||||
X_CFLOAT3 s01, s02;
|
||||
X_CFLOAT3 xs_i0 = _xshake[i0];
|
||||
X_CFLOAT3 xs_i1 = _xshake[i1];
|
||||
X_CFLOAT3 xs_i2 = _xshake[i2];
|
||||
|
||||
s01.x = xs_i0.x - xs_i1.x;
|
||||
s01.y = xs_i0.y - xs_i1.y;
|
||||
|
@ -311,10 +311,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// scalar distances between atoms
|
||||
|
||||
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
|
||||
// a,b,c = coeffs in quadratic equation for lamda
|
||||
|
||||
|
@ -328,48 +328,48 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
|||
invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
|
||||
}
|
||||
|
||||
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
|
||||
X_FLOAT a12 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a12 = X_F(2.0) * invmass0 *
|
||||
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
|
||||
X_FLOAT a21 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a21 = X_F(2.0) * invmass0 *
|
||||
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
|
||||
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
|
||||
|
||||
// error check
|
||||
|
||||
X_FLOAT determ = a11 * a22 - a12 * a21;
|
||||
X_CFLOAT determ = a11 * a22 - a12 * a21;
|
||||
|
||||
if(determ == X_F(0.0)) _flag[0]++;
|
||||
|
||||
X_FLOAT determinv = X_F(1.0) / determ;
|
||||
X_CFLOAT determinv = X_F(1.0) / determ;
|
||||
|
||||
X_FLOAT a11inv = a22 * determinv;
|
||||
X_FLOAT a12inv = -a12 * determinv;
|
||||
X_FLOAT a21inv = -a21 * determinv;
|
||||
X_FLOAT a22inv = a11 * determinv;
|
||||
X_CFLOAT a11inv = a22 * determinv;
|
||||
X_CFLOAT a12inv = -a12 * determinv;
|
||||
X_CFLOAT a21inv = -a21 * determinv;
|
||||
X_CFLOAT a22inv = a11 * determinv;
|
||||
|
||||
// quadratic correction coeffs
|
||||
|
||||
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
|
||||
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
|
||||
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
|
||||
// iterate until converged
|
||||
|
||||
X_FLOAT lamda01 = X_F(0.0);
|
||||
X_FLOAT lamda02 = X_F(0.0);
|
||||
X_CFLOAT lamda01 = X_F(0.0);
|
||||
X_CFLOAT lamda02 = X_F(0.0);
|
||||
int niter = 0;
|
||||
int done = 0;
|
||||
|
||||
X_FLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
|
||||
X_CFLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
|
||||
|
||||
//maybe all running full loop?
|
||||
while(__any(!done) && niter < _max_iter) {
|
||||
|
@ -425,8 +425,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
|||
}
|
||||
|
||||
if(vflag || vflag_atom) {
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
|
||||
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x;
|
||||
*shared = factor * v[0];
|
||||
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
|
||||
|
@ -453,8 +453,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
|
|||
__device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
||||
{
|
||||
int nlist, list[4];
|
||||
ENERGY_FLOAT v[6];
|
||||
X_FLOAT invmass0, invmass1, invmass2, invmass3;
|
||||
ENERGY_CFLOAT v[6];
|
||||
X_CFLOAT invmass0, invmass1, invmass2, invmass3;
|
||||
|
||||
// local atom IDs and constraint distances
|
||||
|
||||
|
@ -462,15 +462,15 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
int i1 = _map_array[_shake_atom[m + _nmax]];
|
||||
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
|
||||
int i3 = _map_array[_shake_atom[m + 3 * _nmax]];
|
||||
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
X_FLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]];
|
||||
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
X_CFLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]];
|
||||
|
||||
// r01 = distance vec between atoms, with PBC
|
||||
|
||||
X_FLOAT3 r01, r02, r03;
|
||||
X_CFLOAT3 r01, r02, r03;
|
||||
|
||||
X_FLOAT4 x_i0, x_i1, x_i2, x_i3;
|
||||
X_CFLOAT4 x_i0, x_i1, x_i2, x_i3;
|
||||
x_i0 = fetchXType(i0);
|
||||
x_i1 = fetchXType(i1);
|
||||
x_i2 = fetchXType(i2);
|
||||
|
@ -493,11 +493,11 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// s01 = distance vec after unconstrained update, with PBC
|
||||
|
||||
X_FLOAT3 s01, s02, s03;
|
||||
X_FLOAT3 xs_i0 = _xshake[i0];
|
||||
X_FLOAT3 xs_i1 = _xshake[i1];
|
||||
X_FLOAT3 xs_i2 = _xshake[i2];
|
||||
X_FLOAT3 xs_i3 = _xshake[i3];
|
||||
X_CFLOAT3 s01, s02, s03;
|
||||
X_CFLOAT3 xs_i0 = _xshake[i0];
|
||||
X_CFLOAT3 xs_i1 = _xshake[i1];
|
||||
X_CFLOAT3 xs_i2 = _xshake[i2];
|
||||
X_CFLOAT3 xs_i3 = _xshake[i3];
|
||||
|
||||
s01.x = xs_i0.x - xs_i1.x;
|
||||
s01.y = xs_i0.y - xs_i1.y;
|
||||
|
@ -516,12 +516,12 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// scalar distances between atoms
|
||||
|
||||
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_FLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z;
|
||||
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
X_FLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z;
|
||||
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_CFLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z;
|
||||
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
X_CFLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z;
|
||||
|
||||
// a,b,c = coeffs in quadratic equation for lamda
|
||||
|
||||
|
@ -537,79 +537,79 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
invmass3 = X_F(1.0) / _mass[static_cast <int>(x_i3.w)];
|
||||
}
|
||||
|
||||
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
|
||||
X_FLOAT a12 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a12 = X_F(2.0) * invmass0 *
|
||||
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
|
||||
X_FLOAT a13 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a13 = X_F(2.0) * invmass0 *
|
||||
(s01.x * r03.x + s01.y * r03.y + s01.z * r03.z);
|
||||
X_FLOAT a21 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a21 = X_F(2.0) * invmass0 *
|
||||
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
|
||||
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
|
||||
X_FLOAT a23 = X_F(2.0) * (invmass0) *
|
||||
X_CFLOAT a23 = X_F(2.0) * (invmass0) *
|
||||
(s02.x * r03.x + s02.y * r03.y + s02.z * r03.z);
|
||||
X_FLOAT a31 = X_F(2.0) * (invmass0) *
|
||||
X_CFLOAT a31 = X_F(2.0) * (invmass0) *
|
||||
(s03.x * r01.x + s03.y * r01.y + s03.z * r01.z);
|
||||
X_FLOAT a32 = X_F(2.0) * (invmass0) *
|
||||
X_CFLOAT a32 = X_F(2.0) * (invmass0) *
|
||||
(s03.x * r02.x + s03.y * r02.y + s03.z * r02.z);
|
||||
X_FLOAT a33 = X_F(2.0) * (invmass0 + invmass3) *
|
||||
X_CFLOAT a33 = X_F(2.0) * (invmass0 + invmass3) *
|
||||
(s03.x * r03.x + s03.y * r03.y + s03.z * r03.z);
|
||||
|
||||
// error check
|
||||
|
||||
X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
|
||||
X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
|
||||
a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
|
||||
|
||||
if(determ == X_F(0.0)) _flag[0]++;
|
||||
|
||||
X_FLOAT determinv = X_F(1.0) / determ;
|
||||
X_CFLOAT determinv = X_F(1.0) / determ;
|
||||
|
||||
X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
|
||||
X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
|
||||
X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
|
||||
X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
|
||||
X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
|
||||
X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
|
||||
X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
|
||||
X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
|
||||
X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
|
||||
X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
|
||||
X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
|
||||
X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
|
||||
X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
|
||||
X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
|
||||
X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
|
||||
X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
|
||||
X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
|
||||
X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
|
||||
|
||||
// quadratic correction coeffs
|
||||
|
||||
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
X_FLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
|
||||
X_FLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
|
||||
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
X_CFLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
|
||||
X_CFLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
|
||||
|
||||
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_FLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
|
||||
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
X_FLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
|
||||
X_FLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
|
||||
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_CFLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
|
||||
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
X_CFLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
|
||||
X_CFLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
|
||||
|
||||
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_FLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
|
||||
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
X_FLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
|
||||
X_FLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
|
||||
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_CFLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
|
||||
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
X_CFLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
|
||||
X_CFLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
|
||||
|
||||
X_FLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_FLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_FLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
|
||||
X_FLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
|
||||
X_FLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
|
||||
X_FLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
|
||||
X_CFLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_CFLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_CFLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
|
||||
X_CFLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
|
||||
X_CFLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
|
||||
X_CFLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
|
||||
// iterate until converged
|
||||
|
||||
X_FLOAT lamda01 = X_F(0.0);
|
||||
X_FLOAT lamda02 = X_F(0.0);
|
||||
X_FLOAT lamda03 = X_F(0.0);
|
||||
X_CFLOAT lamda01 = X_F(0.0);
|
||||
X_CFLOAT lamda02 = X_F(0.0);
|
||||
X_CFLOAT lamda03 = X_F(0.0);
|
||||
int niter = 0;
|
||||
int done = 0;
|
||||
|
||||
X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
|
||||
X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
|
||||
|
||||
//maybe all running full loop?
|
||||
while(__any(!done) && niter < _max_iter) {
|
||||
|
@ -692,8 +692,8 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
}
|
||||
|
||||
if(vflag || vflag_atom) {
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_FLOAT factor = X_F(2.0) / X_F(4.0) * nlist;
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_CFLOAT factor = X_F(2.0) / X_F(4.0) * nlist;
|
||||
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda03 * r03.x * r03.x;
|
||||
*shared = factor * v[0];
|
||||
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
|
||||
|
@ -720,23 +720,23 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
|
|||
__device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
|
||||
{
|
||||
int nlist, list[3];
|
||||
ENERGY_FLOAT v[6];
|
||||
X_FLOAT invmass0, invmass1, invmass2;
|
||||
ENERGY_CFLOAT v[6];
|
||||
X_CFLOAT invmass0, invmass1, invmass2;
|
||||
|
||||
// local atom IDs and constraint distances
|
||||
|
||||
int i0 = _map_array[_shake_atom[m]];
|
||||
int i1 = _map_array[_shake_atom[m + _nmax]];
|
||||
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
|
||||
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
X_FLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]];
|
||||
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
|
||||
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
|
||||
X_CFLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]];
|
||||
|
||||
// r01 = distance vec between atoms, with PBC
|
||||
|
||||
X_FLOAT3 r01, r02, r12;
|
||||
X_CFLOAT3 r01, r02, r12;
|
||||
|
||||
X_FLOAT4 x_i0, x_i1, x_i2;
|
||||
X_CFLOAT4 x_i0, x_i1, x_i2;
|
||||
x_i0 = fetchXType(i0);
|
||||
x_i1 = fetchXType(i1);
|
||||
x_i2 = fetchXType(i2);
|
||||
|
@ -758,10 +758,10 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// s01 = distance vec after unconstrained update, with PBC
|
||||
|
||||
X_FLOAT3 s01, s02, s12;
|
||||
X_FLOAT3 xs_i0 = _xshake[i0];
|
||||
X_FLOAT3 xs_i1 = _xshake[i1];
|
||||
X_FLOAT3 xs_i2 = _xshake[i2];
|
||||
X_CFLOAT3 s01, s02, s12;
|
||||
X_CFLOAT3 xs_i0 = _xshake[i0];
|
||||
X_CFLOAT3 xs_i1 = _xshake[i1];
|
||||
X_CFLOAT3 xs_i2 = _xshake[i2];
|
||||
|
||||
s01.x = xs_i0.x - xs_i1.x;
|
||||
s01.y = xs_i0.y - xs_i1.y;
|
||||
|
@ -780,12 +780,12 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
|
|||
|
||||
// scalar distances between atoms
|
||||
|
||||
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_FLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z;
|
||||
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
X_FLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z;
|
||||
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
|
||||
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
|
||||
X_CFLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z;
|
||||
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
|
||||
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
|
||||
X_CFLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z;
|
||||
|
||||
// a,b,c = coeffs in quadratic equation for lamda
|
||||
|
||||
|
@ -799,79 +799,79 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
|
|||
invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
|
||||
}
|
||||
|
||||
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
|
||||
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
|
||||
X_FLOAT a12 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a12 = X_F(2.0) * invmass0 *
|
||||
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
|
||||
X_FLOAT a13 = - X_F(2.0) * invmass1 *
|
||||
X_CFLOAT a13 = - X_F(2.0) * invmass1 *
|
||||
(s01.x * r12.x + s01.y * r12.y + s01.z * r12.z);
|
||||
X_FLOAT a21 = X_F(2.0) * invmass0 *
|
||||
X_CFLOAT a21 = X_F(2.0) * invmass0 *
|
||||
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
|
||||
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
|
||||
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
|
||||
X_FLOAT a23 = X_F(2.0) * invmass2 *
|
||||
X_CFLOAT a23 = X_F(2.0) * invmass2 *
|
||||
(s02.x * r12.x + s02.y * r12.y + s02.z * r12.z);
|
||||
X_FLOAT a31 = - X_F(2.0) * invmass1 *
|
||||
X_CFLOAT a31 = - X_F(2.0) * invmass1 *
|
||||
(s12.x * r01.x + s12.y * r01.y + s12.z * r01.z);
|
||||
X_FLOAT a32 = X_F(2.0) * invmass2 *
|
||||
X_CFLOAT a32 = X_F(2.0) * invmass2 *
|
||||
(s12.x * r02.x + s12.y * r02.y + s12.z * r02.z);
|
||||
X_FLOAT a33 = X_F(2.0) * (invmass1 + invmass2) *
|
||||
X_CFLOAT a33 = X_F(2.0) * (invmass1 + invmass2) *
|
||||
(s12.x * r12.x + s12.y * r12.y + s12.z * r12.z);
|
||||
|
||||
// inverse of matrix
|
||||
|
||||
X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
|
||||
X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
|
||||
a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
|
||||
|
||||
if(determ == X_F(0.0)) _flag[0]++;
|
||||
|
||||
X_FLOAT determinv = X_F(1.0) / determ;
|
||||
X_CFLOAT determinv = X_F(1.0) / determ;
|
||||
|
||||
X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
|
||||
X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
|
||||
X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
|
||||
X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
|
||||
X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
|
||||
X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
|
||||
X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
|
||||
X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
|
||||
X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
|
||||
X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
|
||||
X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
|
||||
X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
|
||||
X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
|
||||
X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
|
||||
X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
|
||||
X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
|
||||
X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
|
||||
X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
|
||||
|
||||
// quadratic correction coeffs
|
||||
|
||||
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
X_FLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z);
|
||||
X_FLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z);
|
||||
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
|
||||
X_CFLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z);
|
||||
X_CFLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z);
|
||||
|
||||
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_FLOAT quad1_1212 = invmass1 * invmass1 * r12sq;
|
||||
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112;
|
||||
X_FLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212;
|
||||
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
|
||||
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
|
||||
X_CFLOAT quad1_1212 = invmass1 * invmass1 * r12sq;
|
||||
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
|
||||
X_CFLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112;
|
||||
X_CFLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212;
|
||||
|
||||
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_FLOAT quad2_1212 = invmass2 * invmass2 * r12sq;
|
||||
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
X_FLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112;
|
||||
X_FLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212;
|
||||
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
|
||||
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
|
||||
X_CFLOAT quad2_1212 = invmass2 * invmass2 * r12sq;
|
||||
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
|
||||
X_CFLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112;
|
||||
X_CFLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212;
|
||||
|
||||
X_FLOAT quad3_0101 = invmass1 * invmass1 * r01sq;
|
||||
X_FLOAT quad3_0202 = invmass2 * invmass2 * r02sq;
|
||||
X_FLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
|
||||
X_FLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102;
|
||||
X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
|
||||
X_FLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
|
||||
X_CFLOAT quad3_0101 = invmass1 * invmass1 * r01sq;
|
||||
X_CFLOAT quad3_0202 = invmass2 * invmass2 * r02sq;
|
||||
X_CFLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
|
||||
X_CFLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102;
|
||||
X_CFLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
|
||||
X_CFLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
|
||||
// iterate until converged
|
||||
|
||||
X_FLOAT lamda01 = X_F(0.0);
|
||||
X_FLOAT lamda02 = X_F(0.0);
|
||||
X_FLOAT lamda12 = X_F(0.0);
|
||||
X_CFLOAT lamda01 = X_F(0.0);
|
||||
X_CFLOAT lamda02 = X_F(0.0);
|
||||
X_CFLOAT lamda12 = X_F(0.0);
|
||||
int niter = 0;
|
||||
int done = 0;
|
||||
|
||||
X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
|
||||
X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
|
||||
|
||||
//maybe all running full loop?
|
||||
while(__any(!done) && niter < _max_iter) {
|
||||
|
@ -947,8 +947,8 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
|
|||
}
|
||||
|
||||
if(vflag || vflag_atom) {
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
|
||||
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda12 * r12.x * r12.x;
|
||||
*shared = factor * v[0];
|
||||
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
|
||||
|
@ -986,7 +986,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list,
|
|||
else if(sflag == 4) FixShakeCuda_Shake4(vflag, vflag_atom, m);
|
||||
else FixShakeCuda_Shake3Angle(vflag, vflag_atom, m);
|
||||
} else {
|
||||
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
|
||||
*shared = ENERGY_F(0.0);
|
||||
shared += blockDim.x;
|
||||
*shared = ENERGY_F(0.0);
|
||||
|
@ -1008,7 +1008,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list,
|
|||
|
||||
}
|
||||
|
||||
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
|
||||
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -1018,15 +1018,15 @@ __global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistle
|
|||
|
||||
if(j > _nmax) _flag[0] = 1;
|
||||
|
||||
X_FLOAT3 xs = _xshake[j];
|
||||
((X_FLOAT*) _buffer)[i] = xs.x + dx;
|
||||
((X_FLOAT*) _buffer)[i + 1 * n] = xs.y + dy;
|
||||
((X_FLOAT*) _buffer)[i + 2 * n] = xs.z + dz;
|
||||
X_CFLOAT3 xs = _xshake[j];
|
||||
((X_CFLOAT*) _buffer)[i] = xs.x + dx;
|
||||
((X_CFLOAT*) _buffer)[i + 1 * n] = xs.y + dy;
|
||||
((X_CFLOAT*) _buffer)[i + 2 * n] = xs.z + dz;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
|
||||
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -1036,7 +1036,7 @@ __global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxl
|
|||
|
||||
if(j > _nmax) _flag[0] = 1;
|
||||
|
||||
X_FLOAT3 xs = _xshake[j];
|
||||
X_CFLOAT3 xs = _xshake[j];
|
||||
xs.x += dx;
|
||||
xs.y += dy;
|
||||
xs.z += dz;
|
||||
|
@ -1050,10 +1050,10 @@ __global__ void FixShakeCuda_UnpackComm_Kernel(int n, int first)
|
|||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < n) {
|
||||
X_FLOAT3 xs;
|
||||
xs.x = ((X_FLOAT*) _buffer)[i];
|
||||
xs.y = ((X_FLOAT*) _buffer)[i + 1 * n];
|
||||
xs.z = ((X_FLOAT*) _buffer)[i + 2 * n];
|
||||
X_CFLOAT3 xs;
|
||||
xs.x = ((X_CFLOAT*) _buffer)[i];
|
||||
xs.y = ((X_CFLOAT*) _buffer)[i + 1 * n];
|
||||
xs.z = ((X_CFLOAT*) _buffer)[i + 2 * n];
|
||||
_xshake[i + first] = xs;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -48,7 +48,7 @@ void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
|
|||
|
||||
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
|
||||
{
|
||||
V_FLOAT factor = afactor;
|
||||
V_CFLOAT factor = afactor;
|
||||
|
||||
if(sdata->atom.update_nmax)
|
||||
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
|
||||
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -48,7 +48,7 @@ void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
|
|||
|
||||
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
|
||||
{
|
||||
V_FLOAT factor = afactor;
|
||||
V_CFLOAT factor = afactor;
|
||||
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
|
||||
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
|
||||
//if(sdata->atom.update_nlocal)
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
|
||||
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
|
||||
}
|
||||
|
||||
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
|
||||
|
@ -48,7 +48,7 @@ void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
|
|||
|
||||
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
|
||||
{
|
||||
V_FLOAT factor = afactor;
|
||||
V_CFLOAT factor = afactor;
|
||||
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
|
||||
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
|
||||
//if(sdata->atom.update_nlocal)
|
||||
|
|
|
@ -23,15 +23,15 @@
|
|||
|
||||
|
||||
|
||||
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor, V_FLOAT limit)
|
||||
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
V_FLOAT vx = _v[i];
|
||||
V_FLOAT vy = _v[i + _nmax];
|
||||
V_FLOAT vz = _v[i + 2 * _nmax];
|
||||
V_CFLOAT vx = _v[i];
|
||||
V_CFLOAT vy = _v[i + _nmax];
|
||||
V_CFLOAT vz = _v[i + 2 * _nmax];
|
||||
vx *= factor;
|
||||
vy *= factor;
|
||||
vz *= factor;
|
||||
|
|
|
@ -35,8 +35,8 @@ void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
|
|||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
}
|
||||
|
||||
|
@ -60,7 +60,7 @@ void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void*
|
|||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_FLOAT*) gamma);
|
||||
Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_CFLOAT*) gamma);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
|
||||
|
||||
|
|
|
@ -21,13 +21,13 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_FLOAT* gamma)
|
||||
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
if(i < _nlocal)
|
||||
if(_mask[i] & groupbit) {
|
||||
F_FLOAT drag = gamma[_type[i]];
|
||||
F_CFLOAT drag = gamma[_type[i]];
|
||||
_f[i] -= drag * _v[i];
|
||||
_f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax];
|
||||
_f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax];
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
#define _nex_group MY_AP(nex_group)
|
||||
#define _ex_mol_bit MY_AP(ex_mol_bit)
|
||||
#define _nex_mol MY_AP(nex_mol)
|
||||
__device__ __constant__ CUDA_FLOAT* _cutneighsq;
|
||||
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
|
||||
__device__ __constant__ int* _ex_type;
|
||||
__device__ __constant__ int _nex_type;
|
||||
__device__ __constant__ int* _ex1_bit;
|
||||
|
@ -54,7 +54,7 @@ void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist*
|
|||
{
|
||||
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
|
||||
|
||||
int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT)));
|
||||
int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -77,7 +77,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
|||
Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
|
||||
|
||||
// initialize only on first call
|
||||
CUDA_FLOAT rez_bin_size[3] = {
|
||||
CUDA_CFLOAT rez_bin_size[3] = {
|
||||
(1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
|
||||
(1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
|
||||
(1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
|
||||
|
@ -87,10 +87,10 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
|||
|
||||
if(! init) {
|
||||
init = 0;
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_FLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_CFLOAT) * 3);
|
||||
}
|
||||
|
||||
|
||||
|
@ -101,7 +101,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
|||
my_times starttime, endtime;
|
||||
my_gettime(CLOCK_REALTIME, &starttime);
|
||||
|
||||
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
|
||||
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
|
||||
|
||||
Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
|
||||
cudaThreadSynchronize();
|
||||
|
@ -126,7 +126,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
|||
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
//Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
|
||||
CUDA_FLOAT globcutoff = -1.0;
|
||||
CUDA_CFLOAT globcutoff = -1.0;
|
||||
|
||||
short init = 0;
|
||||
|
||||
|
@ -137,11 +137,11 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
|
||||
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
|
||||
|
||||
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
|
||||
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
|
||||
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
|
||||
//printf("Allocate: %i\n",nx);
|
||||
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
|
||||
sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);
|
||||
|
||||
if(sneighlist->cutneighsq) {
|
||||
int cutoffsdiffer = 0;
|
||||
|
@ -149,13 +149,13 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
|
||||
if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
|
||||
}
|
||||
}
|
||||
|
||||
if(not cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0;
|
||||
if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
|
||||
} else {
|
||||
MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
|
||||
return 0;
|
||||
|
@ -173,7 +173,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
}
|
||||
|
||||
CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
|
||||
cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*));
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(special_flag) , sdata->atom.special_flag , 4 * sizeof(int));
|
||||
|
@ -218,14 +218,14 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
|
||||
dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
|
||||
|
||||
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
|
||||
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
|
||||
int buffer[20];
|
||||
buffer[0] = 1;
|
||||
buffer[1] = 0;
|
||||
CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
|
||||
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
|
||||
//cudaMemset(sdata->debugdata,0,100*sizeof(int));
|
||||
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
|
||||
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
|
||||
MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
|
||||
//shared_size=2056;
|
||||
my_times starttime, endtime;
|
||||
|
@ -245,7 +245,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
|
||||
(sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
|
||||
}
|
||||
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
|
||||
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
|
||||
// (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
@ -301,13 +301,13 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
|
||||
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
|
||||
|
||||
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
|
||||
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
|
||||
|
||||
if(sneighlist->cutneighsq) {
|
||||
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
|
||||
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
|
||||
//printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
|
||||
}
|
||||
}
|
||||
|
@ -339,7 +339,7 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
|
||||
|
||||
free(acutneighsq);
|
||||
|
|
|
@ -24,26 +24,26 @@
|
|||
#define SBBITS 30
|
||||
|
||||
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
|
||||
CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
|
||||
CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y, CUDA_CFLOAT rez_bin_size_z)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
/*int* bin_count=(int*) _buffer;
|
||||
bin_count=bin_count+20;
|
||||
CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
|
||||
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
|
||||
CUDA_CFLOAT* binned_x=(CUDA_CFLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
|
||||
if(i < _nall) {
|
||||
// copy atom position from global device memory to local register
|
||||
// in this 3 steps to get as much coalesced access as possible
|
||||
X_FLOAT* my_x = _x + i;
|
||||
CUDA_FLOAT x_i = *my_x;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
CUDA_CFLOAT x_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT y_i = *my_x;
|
||||
CUDA_CFLOAT y_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT z_i = *my_x;
|
||||
CUDA_CFLOAT z_i = *my_x;
|
||||
|
||||
|
||||
// calculate flat bin index
|
||||
|
@ -102,7 +102,7 @@ __device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern __shared__ CUDA_FLOAT shared[];
|
||||
extern __shared__ CUDA_CFLOAT shared[];
|
||||
|
||||
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
|
||||
{
|
||||
|
@ -114,12 +114,12 @@ __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
|
|||
}
|
||||
|
||||
template <const unsigned int exclude>
|
||||
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
|
||||
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall)
|
||||
{
|
||||
int natoms = neighall ? _nall : _nlocal;
|
||||
//const bool domol=false;
|
||||
int bin_dim_z = gridDim.y;
|
||||
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
|
||||
|
@ -129,19 +129,19 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
|
|||
int bin_c = bin_count[bin];
|
||||
|
||||
|
||||
CUDA_FLOAT cut;
|
||||
CUDA_CFLOAT cut;
|
||||
|
||||
if(globcutoff > 0)
|
||||
cut = globcutoff;
|
||||
|
||||
int i = _nall;
|
||||
CUDA_FLOAT* my_x;
|
||||
CUDA_FLOAT x_i, y_i, z_i;
|
||||
CUDA_CFLOAT* my_x;
|
||||
CUDA_CFLOAT x_i, y_i, z_i;
|
||||
|
||||
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
|
||||
|
||||
int actIdx = threadIdx.x + actOffset;
|
||||
CUDA_FLOAT* other_x = shared;
|
||||
CUDA_CFLOAT* other_x = shared;
|
||||
int* other_id = (int*) &other_x[3 * blockDim.x];
|
||||
|
||||
if(actIdx < bin_c) {
|
||||
|
@ -206,10 +206,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
|
|||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_FLOAT delx = x_i - other_x[kk];
|
||||
CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
CUDA_CFLOAT delx = x_i - other_x[kk];
|
||||
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
|
@ -268,10 +268,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
|
|||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_FLOAT delx = x_i - other_x[k];
|
||||
CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
CUDA_CFLOAT delx = x_i - other_x[k];
|
||||
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if(jnum < _maxneighbors) {
|
||||
|
@ -378,10 +378,10 @@ __global__ void FindSpecial(int block_style)
|
|||
_numneigh[i] = jnum;
|
||||
}
|
||||
|
||||
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
|
||||
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style)
|
||||
{
|
||||
int bin_dim_z = gridDim.y;
|
||||
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
|
||||
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
|
||||
binned_x = &binned_x[2];
|
||||
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
|
||||
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
|
||||
|
@ -391,19 +391,19 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
|
|||
int bin_c = bin_count[bin];
|
||||
|
||||
|
||||
CUDA_FLOAT cut;
|
||||
CUDA_CFLOAT cut;
|
||||
|
||||
if(globcutoff > 0)
|
||||
cut = globcutoff;
|
||||
|
||||
int i = _nall;
|
||||
CUDA_FLOAT* my_x;
|
||||
CUDA_FLOAT x_i, y_i, z_i;
|
||||
CUDA_CFLOAT* my_x;
|
||||
CUDA_CFLOAT x_i, y_i, z_i;
|
||||
|
||||
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
|
||||
|
||||
int actIdx = threadIdx.x + actOffset;
|
||||
CUDA_FLOAT* other_x = shared;
|
||||
CUDA_CFLOAT* other_x = shared;
|
||||
int* other_id = (int*) &other_x[3 * blockDim.x];
|
||||
|
||||
if(actIdx < bin_c) {
|
||||
|
@ -469,10 +469,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
|
|||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_FLOAT delx = x_i - other_x[kk];
|
||||
CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
CUDA_CFLOAT delx = x_i - other_x[kk];
|
||||
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
|
@ -549,10 +549,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
|
|||
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
|
||||
}
|
||||
|
||||
CUDA_FLOAT delx = x_i - other_x[k];
|
||||
CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
CUDA_CFLOAT delx = x_i - other_x[k];
|
||||
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
|
||||
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq <= cut && i != j) {
|
||||
if((j >= _nlocal) && (i_border < 0))
|
||||
|
@ -612,12 +612,12 @@ __global__ void NeighborBuildFullNsq_Kernel()
|
|||
int* buffer = (int*) _buffer;
|
||||
|
||||
if(i < _nlocal) {
|
||||
X_FLOAT* my_x = _x + i;
|
||||
CUDA_FLOAT x_i = *my_x;
|
||||
X_CFLOAT* my_x = _x + i;
|
||||
CUDA_CFLOAT x_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT y_i = *my_x;
|
||||
CUDA_CFLOAT y_i = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT z_i = *my_x;
|
||||
CUDA_CFLOAT z_i = *my_x;
|
||||
int jnum = 0;
|
||||
int* jlist = _firstneigh[i];
|
||||
_ilist[i] = i;
|
||||
|
@ -627,15 +627,15 @@ __global__ void NeighborBuildFullNsq_Kernel()
|
|||
|
||||
for(int j = 0; j < _nall; ++j) {
|
||||
my_x = _x + j;
|
||||
CUDA_FLOAT x_j = *my_x;
|
||||
CUDA_CFLOAT x_j = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT y_j = *my_x;
|
||||
CUDA_CFLOAT y_j = *my_x;
|
||||
my_x += _nmax;
|
||||
CUDA_FLOAT z_j = *my_x;
|
||||
CUDA_FLOAT delx = x_i - x_j;
|
||||
CUDA_FLOAT dely = y_i - y_j;
|
||||
CUDA_FLOAT delz = z_i - z_j;
|
||||
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
CUDA_CFLOAT z_j = *my_x;
|
||||
CUDA_CFLOAT delx = x_i - x_j;
|
||||
CUDA_CFLOAT dely = y_i - y_j;
|
||||
CUDA_CFLOAT delz = z_i - z_j;
|
||||
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
int jtype = _type[j];
|
||||
|
||||
if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
|
||||
|
|
|
@ -60,10 +60,10 @@ void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -20,13 +20,13 @@
|
|||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_FLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
|
||||
const F_FLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
|
||||
const F_CFLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
|
||||
F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv;
|
||||
|
||||
if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sn
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -59,10 +59,10 @@ void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -60,10 +60,10 @@ void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -20,13 +20,13 @@
|
|||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairBuckCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_FLOAT r = _RSQRT_(r2inv);
|
||||
const F_FLOAT rexp = _EXP_(-r * _rhoinv[ij_type]);
|
||||
const F_FLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT r = _RSQRT_(r2inv);
|
||||
const F_CFLOAT rexp = _EXP_(-r * _rhoinv[ij_type]);
|
||||
const F_CFLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv;
|
||||
|
||||
if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv -
|
||||
_offset[ij_type]);
|
||||
|
|
|
@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -71,10 +71,10 @@ void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -21,28 +21,28 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4
|
||||
__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) //0.11 of 0.4
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const int cg_type = _cg_type[ij_type];
|
||||
const F_FLOAT r4inv = r2inv * r2inv;
|
||||
const F_FLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
|
||||
const F_FLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
|
||||
const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
|
||||
const F_CFLOAT r4inv = r2inv * r2inv;
|
||||
const F_CFLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
|
||||
const F_CFLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
|
||||
const F_CFLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
|
||||
|
||||
if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]);
|
||||
|
||||
return factor_lj * forcelj * r2inv;
|
||||
}
|
||||
|
||||
/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
|
||||
/*__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT& rsq,const int ij_type,F_CFLOAT& factor_lj,int& eflag, ENERGY_CFLOAT& evdwl)
|
||||
{
|
||||
const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type);
|
||||
const F_FLOAT r2inv = F_F(1.0)/rsq;
|
||||
const F_FLOAT r4inv = r2inv*r2inv;
|
||||
const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
|
||||
const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0);
|
||||
const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
|
||||
const F_CFLOAT r2inv = F_F(1.0)/rsq;
|
||||
const F_CFLOAT r4inv = r2inv*r2inv;
|
||||
const F_CFLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
|
||||
const F_CFLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0);
|
||||
const F_CFLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
|
||||
|
||||
if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second));
|
||||
return factor_lj*forcelj*r2inv;
|
||||
|
|
|
@ -39,18 +39,18 @@
|
|||
#define _rho MY_AP(rho)
|
||||
#define _fp MY_AP(fp)
|
||||
|
||||
__device__ __constant__ F_FLOAT MY_AP(rdr);
|
||||
__device__ __constant__ F_FLOAT MY_AP(rdrho);
|
||||
__device__ __constant__ F_CFLOAT MY_AP(rdr);
|
||||
__device__ __constant__ F_CFLOAT MY_AP(rdrho);
|
||||
__device__ __constant__ int MY_AP(nr);
|
||||
__device__ __constant__ int MY_AP(nrho);
|
||||
__device__ __constant__ int MY_AP(nfrho);
|
||||
__device__ __constant__ int MY_AP(nrhor);
|
||||
__device__ __constant__ int MY_AP(nz2r);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(frho_spline);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(rhor_spline);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(z2r_spline);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(rho);
|
||||
__device__ __constant__ F_FLOAT* MY_AP(fp);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(frho_spline);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(rhor_spline);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(z2r_spline);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(rho);
|
||||
__device__ __constant__ F_CFLOAT* MY_AP(fp);
|
||||
|
||||
#define _rhor_spline_tex MY_AP(rhor_spline_tex)
|
||||
#if F_PRECISION == 1
|
||||
|
@ -115,10 +115,10 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
|
|||
void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
|
||||
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
|
||||
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
|
||||
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -151,13 +151,13 @@ void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighl
|
|||
void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
|
||||
}
|
||||
|
||||
|
@ -175,18 +175,18 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in
|
|||
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
|
||||
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
|
||||
|
||||
unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
unsigned nI = sizeof(F_CFLOAT) * cuda_ntypes * cuda_ntypes;
|
||||
|
||||
X_FLOAT cutsq_global;
|
||||
cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
|
||||
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
|
||||
X_CFLOAT cutsq_global;
|
||||
cutsq_global = (X_CFLOAT)(sdata->pair.cut_global);
|
||||
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT));
|
||||
|
||||
|
||||
F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
|
||||
F_CFLOAT* coeff_buf = new F_CFLOAT[cuda_ntypes * cuda_ntypes];
|
||||
|
||||
for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_CFLOAT));
|
||||
|
||||
for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
|
||||
|
||||
|
@ -197,34 +197,34 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in
|
|||
cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
|
||||
|
||||
delete [] coeff_buf;
|
||||
X_FLOAT box_size[3] = {
|
||||
X_CFLOAT box_size[3] = {
|
||||
sdata->domain.subhi[0] - sdata->domain.sublo[0],
|
||||
sdata->domain.subhi[1] - sdata->domain.sublo[1],
|
||||
sdata->domain.subhi[2] - sdata->domain.sublo[2]
|
||||
};
|
||||
F_FLOAT rdr_F = rdr;
|
||||
F_FLOAT rdrho_F = rdrho;
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
|
||||
F_CFLOAT rdr_F = rdr;
|
||||
F_CFLOAT rdrho_F = rdrho;
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
|
||||
|
||||
rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
|
||||
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
|
||||
rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT);
|
||||
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT);
|
||||
rhor_spline_pointer = rhor_spline;
|
||||
z2r_spline_pointer = z2r_spline;
|
||||
|
||||
|
@ -249,8 +249,8 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
if(sdata->buffer_new)
|
||||
Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
|
||||
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
|
||||
int sharedperproc = 0;
|
||||
|
||||
|
@ -258,7 +258,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
|
||||
if(vflag || vflag_atom) sharedperproc = 7;
|
||||
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
|
@ -270,7 +270,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
|
||||
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
|
||||
PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
|
||||
|
||||
|
@ -288,7 +288,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
|
||||
if(vflag || vflag_atom) sharedperproc = 7;
|
||||
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
|
@ -300,7 +300,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
|
||||
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
|
||||
PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
|
||||
|
@ -310,7 +310,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
|
|||
grid.x = sharedperproc;
|
||||
grid.y = 1;
|
||||
threads.x = 256;
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)*sharedperproc>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
|
||||
}
|
||||
|
@ -324,19 +324,19 @@ void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
|
|||
int3 layout = getgrid(n, 0);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
|
||||
F_CFLOAT* buf = (F_CFLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
|
||||
|
||||
PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
|
||||
, sdata->comm.maxlistlength, iswap, buf);
|
||||
cudaThreadSynchronize();
|
||||
cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(buf_send, buf, n* sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
|
||||
void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
|
||||
{
|
||||
F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
|
||||
cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
|
||||
F_CFLOAT* fp_first = &(((F_CFLOAT*) fp)[first]);
|
||||
cudaMemcpy(fp_first, buf_recv, n * sizeof(F_CFLOAT), cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
#undef _type2frho
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
|
||||
|
||||
static __device__ inline F_FLOAT4 fetchRhor(int i)
|
||||
static __device__ inline F_CFLOAT4 fetchRhor(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if F_PRECISION == 1
|
||||
|
@ -37,7 +37,7 @@ static __device__ inline F_FLOAT4 fetchRhor(int i)
|
|||
#endif
|
||||
}
|
||||
|
||||
static __device__ inline F_FLOAT4 fetchZ2r(int i)
|
||||
static __device__ inline F_CFLOAT4 fetchZ2r(int i)
|
||||
{
|
||||
#ifdef CUDA_USE_TEXTURE
|
||||
#if F_PRECISION == 1
|
||||
|
@ -52,8 +52,8 @@ static __device__ inline F_FLOAT4 fetchZ2r(int i)
|
|||
|
||||
__global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
ENERGY_FLOAT* sharedE;
|
||||
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT* sharedE;
|
||||
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
|
@ -73,9 +73,9 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT delx, dely, delz;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
int itype;
|
||||
int i = _nlocal;
|
||||
int jnum = 0;
|
||||
|
@ -109,17 +109,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
|
|||
dely = ytmp - myxtype.y;
|
||||
delz = ztmp - myxtype.z;
|
||||
int jtype = static_cast <int>(myxtype.w);
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < _cutsq_global) {
|
||||
F_FLOAT p = sqrt(rsq) * _rdr + F_F(1.0);
|
||||
F_CFLOAT p = sqrt(rsq) * _rdr + F_F(1.0);
|
||||
int m = static_cast<int>(p);
|
||||
m = MIN(m, _nr - 1);
|
||||
p -= m;
|
||||
p = MIN(p, F_F(1.0));
|
||||
|
||||
int k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
|
||||
F_FLOAT4 c = fetchRhor(k + 1);
|
||||
F_CFLOAT4 c = fetchRhor(k + 1);
|
||||
_rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z;
|
||||
}
|
||||
}
|
||||
|
@ -127,12 +127,12 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
if(ii < _inum) {
|
||||
|
||||
F_FLOAT p = _rho[i] * _rdrho + F_F(1.0);
|
||||
F_CFLOAT p = _rho[i] * _rdrho + F_F(1.0);
|
||||
int m = static_cast<int>(p);
|
||||
m = MAX(1, MIN(m, _nrho - 1));
|
||||
p -= m;
|
||||
p = MIN(p, F_F(1.0));
|
||||
F_FLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
|
||||
F_CFLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
|
||||
_fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2];
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
|
@ -148,17 +148,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
|
|||
_eatom[i] += sharedmem[threadIdx.x];
|
||||
|
||||
reduceBlock(sharedmem);
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||||
{
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
|
||||
ENERGY_FLOAT* sharedE;
|
||||
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
ENERGY_CFLOAT* sharedE;
|
||||
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
|
||||
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
|
@ -178,10 +178,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
|
||||
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_FLOAT4 myxtype;
|
||||
F_FLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_FLOAT delx, dely, delz;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT4 myxtype;
|
||||
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
int itype, i;
|
||||
int jnum = 0;
|
||||
int* jlist;
|
||||
|
@ -206,7 +206,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
_rho[i] = F_F(0.0);
|
||||
}
|
||||
|
||||
if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii];
|
||||
if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_CFLOAT*) _buffer)[ii];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
|
@ -219,35 +219,35 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
dely = ytmp - myxtype.y;
|
||||
delz = ztmp - myxtype.z;
|
||||
int jtype = static_cast <int>(myxtype.w);
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < _cutsq_global) {
|
||||
F_FLOAT r = _SQRT_(rsq);
|
||||
F_FLOAT p = r * _rdr + F_F(1.0);
|
||||
F_CFLOAT r = _SQRT_(rsq);
|
||||
F_CFLOAT p = r * _rdr + F_F(1.0);
|
||||
int m = static_cast<int>(p);
|
||||
m = MIN(m, _nr - 1);
|
||||
p -= m;
|
||||
p = MIN(p, F_F(1.0));
|
||||
|
||||
int k = (static_cast <int>(_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
|
||||
F_FLOAT4 c = fetchRhor(k);
|
||||
F_FLOAT rhoip = (c.x * p + c.y) * p + c.z;
|
||||
F_CFLOAT4 c = fetchRhor(k);
|
||||
F_CFLOAT rhoip = (c.x * p + c.y) * p + c.z;
|
||||
k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
|
||||
c = fetchRhor(k);
|
||||
F_FLOAT rhojp = (c.x * p + c.y) * p + c.z;
|
||||
F_CFLOAT rhojp = (c.x * p + c.y) * p + c.z;
|
||||
k = (static_cast <int>(_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
|
||||
c = fetchZ2r(k);
|
||||
F_FLOAT z2p = (c.x * p + c.y) * p + c.z;
|
||||
F_CFLOAT z2p = (c.x * p + c.y) * p + c.z;
|
||||
c = fetchZ2r(k + 1);
|
||||
F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;
|
||||
F_CFLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;
|
||||
|
||||
F_FLOAT recip = F_F(1.0) / r;
|
||||
F_FLOAT phi = z2 * recip;
|
||||
F_FLOAT phip = z2p * recip - phi * recip;
|
||||
F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
|
||||
F_CFLOAT recip = F_F(1.0) / r;
|
||||
F_CFLOAT phi = z2 * recip;
|
||||
F_CFLOAT phip = z2p * recip - phi * recip;
|
||||
F_CFLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
|
||||
fpair = -psip * recip;
|
||||
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
fxtmp += dxfp = delx * fpair;
|
||||
fytmp += dyfp = dely * fpair;
|
||||
fztmp += dzfp = delz * fpair;
|
||||
|
@ -268,10 +268,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
__syncthreads();
|
||||
|
||||
if(ii < _inum) {
|
||||
F_FLOAT* my_f;
|
||||
F_CFLOAT* my_f;
|
||||
|
||||
if(_collect_forces_later) {
|
||||
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||||
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
|
||||
|
||||
if(eflag) {
|
||||
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||||
|
@ -281,7 +281,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||||
}
|
||||
|
||||
my_f = (F_FLOAT*) buffer;
|
||||
my_f = (F_CFLOAT*) buffer;
|
||||
my_f += i;
|
||||
*my_f = fxtmp;
|
||||
my_f += _nmax;
|
||||
|
@ -320,7 +320,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
|
|||
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
|
||||
}
|
||||
|
||||
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer)
|
||||
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_CFLOAT* buffer)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
int* list = sendlist + iswap * maxlistlength;
|
||||
|
@ -331,7 +331,7 @@ __global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlen
|
|||
}
|
||||
}
|
||||
|
||||
__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer)
|
||||
__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_CFLOAT* buffer)
|
||||
{
|
||||
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||||
|
||||
|
|
|
@ -37,10 +37,10 @@
|
|||
void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
|
||||
{
|
||||
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed");
|
||||
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_FLOAT));
|
||||
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_CFLOAT));
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_FLOAT);
|
||||
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_CFLOAT);
|
||||
|
||||
if(sdata->buffersize < size) {
|
||||
MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
|
||||
|
@ -72,15 +72,15 @@ void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neig
|
|||
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*));
|
||||
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(maxneighbors), &sneighlist->maxneighbors , sizeof(int));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
|
||||
cudaMemcpyToSymbol(MY_AP(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int));
|
||||
|
||||
|
@ -101,32 +101,32 @@ void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata)
|
|||
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
|
||||
|
||||
unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
|
||||
unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
|
||||
unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2;
|
||||
|
||||
F_FLOAT coeffs1[cuda_ntypes2];
|
||||
coeffs1[0] = (F_FLOAT) sdata->pair.coeff1[0][0];
|
||||
coeffs1[1] = (F_FLOAT) sdata->pair.coeff1[0][1];
|
||||
coeffs1[2] = (F_FLOAT) sdata->pair.coeff1[1][0];
|
||||
F_FLOAT coeffs3[cuda_ntypes2];
|
||||
coeffs3[0] = (F_FLOAT) sdata->pair.coeff1[1][1];
|
||||
F_FLOAT coeffs2[cuda_ntypes2];
|
||||
coeffs2[0] = (F_FLOAT) sdata->pair.coeff2[0][0];
|
||||
coeffs2[1] = (F_FLOAT) sdata->pair.coeff2[0][1];
|
||||
F_CFLOAT coeffs1[cuda_ntypes2];
|
||||
coeffs1[0] = (F_CFLOAT) sdata->pair.coeff1[0][0];
|
||||
coeffs1[1] = (F_CFLOAT) sdata->pair.coeff1[0][1];
|
||||
coeffs1[2] = (F_CFLOAT) sdata->pair.coeff1[1][0];
|
||||
F_CFLOAT coeffs3[cuda_ntypes2];
|
||||
coeffs3[0] = (F_CFLOAT) sdata->pair.coeff1[1][1];
|
||||
F_CFLOAT coeffs2[cuda_ntypes2];
|
||||
coeffs2[0] = (F_CFLOAT) sdata->pair.coeff2[0][0];
|
||||
coeffs2[1] = (F_CFLOAT) sdata->pair.coeff2[0][1];
|
||||
|
||||
|
||||
X_FLOAT box_size[3] = {
|
||||
X_CFLOAT box_size[3] = {
|
||||
sdata->domain.subhi[0] - sdata->domain.sublo[0],
|
||||
sdata->domain.subhi[1] - sdata->domain.sublo[1],
|
||||
sdata->domain.subhi[2] - sdata->domain.sublo[2]
|
||||
};
|
||||
//printf("n: %i %i\n",n,CUDA_MAX_TYPES2);
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
|
||||
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
|
||||
cudaMemcpyToSymbol(MY_AP(coeff1) , coeffs1 , n);
|
||||
cudaMemcpyToSymbol(MY_AP(coeff2) , coeffs2 , n);
|
||||
cudaMemcpyToSymbol(MY_AP(coeff3) , coeffs3 , n);
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
|
||||
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
|
||||
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed");
|
||||
}
|
||||
|
@ -156,7 +156,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
|
|||
|
||||
if(vflag) sharedperproc += 6;
|
||||
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT), 128);
|
||||
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT), 128);
|
||||
dim3 threads(layout.z, 1, 1);
|
||||
dim3 grid(layout.x, layout.y, 1);
|
||||
|
||||
|
@ -168,11 +168,11 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
|
|||
Cuda_PairGranHookeCuda_Init(sdata);
|
||||
}
|
||||
|
||||
MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
|
||||
MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
|
||||
|
||||
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation");
|
||||
PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
|
||||
, (F_FLOAT) sdata->pair.coeff1[0][0], (F_FLOAT) sdata->pair.coeff1[1][0], (F_FLOAT) sdata->pair.coeff1[1][1], (F_FLOAT) sdata->pair.coeff2[0][0]);
|
||||
PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
|
||||
, (F_CFLOAT) sdata->pair.coeff1[0][0], (F_CFLOAT) sdata->pair.coeff1[1][0], (F_CFLOAT) sdata->pair.coeff1[1][1], (F_CFLOAT) sdata->pair.coeff2[0][0]);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed");
|
||||
|
||||
|
@ -181,7 +181,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
|
|||
grid.x = sharedperproc;
|
||||
grid.y = 1;
|
||||
threads.x = 256;
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
|
||||
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed");
|
||||
}
|
||||
|
|
|
@ -23,12 +23,12 @@
|
|||
|
||||
|
||||
__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id
|
||||
, F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu)
|
||||
, F_CFLOAT kn, F_CFLOAT gamman, F_CFLOAT gammat, F_CFLOAT xmu)
|
||||
{
|
||||
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||||
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
|
||||
|
||||
ENERGY_FLOAT* sharedE;
|
||||
ENERGY_FLOAT* sharedV;
|
||||
ENERGY_CFLOAT* sharedE;
|
||||
ENERGY_CFLOAT* sharedV;
|
||||
|
||||
if(eflag || eflag_atom) {
|
||||
sharedE = &sharedmem[threadIdx.x];
|
||||
|
@ -51,18 +51,18 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
|
|||
|
||||
MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");)
|
||||
|
||||
X_FLOAT xtmp, ytmp, ztmp;
|
||||
X_CFLOAT xtmp, ytmp, ztmp;
|
||||
|
||||
X_FLOAT4 myxtype;
|
||||
V_FLOAT4 myvradius, ovradius;
|
||||
F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
|
||||
F_FLOAT delx, dely, delz;
|
||||
F_FLOAT radi, radj, radsum, r, rsqinv;
|
||||
F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
|
||||
F_FLOAT wr1, wr2, wr3;
|
||||
F_FLOAT vtr1, vtr2, vtr3, vrel;
|
||||
F_FLOAT meff, damp, ccel, tor1, tor2, tor3;
|
||||
F_FLOAT fn, fs, ft, fs1, fs2, fs3;
|
||||
X_CFLOAT4 myxtype;
|
||||
V_CFLOAT4 myvradius, ovradius;
|
||||
F_CFLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
|
||||
F_CFLOAT delx, dely, delz;
|
||||
F_CFLOAT radi, radj, radsum, r, rsqinv;
|
||||
F_CFLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
|
||||
F_CFLOAT wr1, wr2, wr3;
|
||||
F_CFLOAT vtr1, vtr2, vtr3, vrel;
|
||||
F_CFLOAT meff, damp, ccel, tor1, tor2, tor3;
|
||||
F_CFLOAT fn, fs, ft, fs1, fs2, fs3;
|
||||
|
||||
int jnum = 0;
|
||||
int i, j;
|
||||
|
@ -108,10 +108,10 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
|
|||
radj = ovradius.w;
|
||||
radsum = radi + radj;
|
||||
|
||||
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < radsum * radsum) {
|
||||
const F_FLOAT rinv = _RSQRT_(rsq);
|
||||
const F_CFLOAT rinv = _RSQRT_(rsq);
|
||||
r = F_F(1.0) / rinv;
|
||||
rsqinv = F_F(1.0) / rsq;
|
||||
|
||||
|
@ -135,8 +135,8 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
|
|||
vt3 = vr3 - vn3;
|
||||
|
||||
// relative rotational velocity
|
||||
V_FLOAT4 omegarmass_i = fetchOmegaRmass(i);
|
||||
V_FLOAT4 omegarmass_j = fetchOmegaRmass(j);
|
||||
V_CFLOAT4 omegarmass_i = fetchOmegaRmass(i);
|
||||
V_CFLOAT4 omegarmass_j = fetchOmegaRmass(j);
|
||||
|
||||
wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv;
|
||||
wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv;
|
||||
|
@ -165,7 +165,7 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
|
|||
fs2 = -ft * vtr2;
|
||||
fs3 = -ft * vtr3;
|
||||
|
||||
F_FLOAT dxfp, dyfp, dzfp;
|
||||
F_CFLOAT dxfp, dyfp, dzfp;
|
||||
fxtmp += dxfp = delx * ccel + fs1;
|
||||
fytmp += dyfp = dely * ccel + fs2;
|
||||
fztmp += dzfp = delz * ccel + fs3;
|
||||
|
@ -194,13 +194,13 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
|
|||
__syncthreads();
|
||||
|
||||
if(ii < _inum) {
|
||||
F_FLOAT* my_f = _f + i;
|
||||
F_CFLOAT* my_f = _f + i;
|
||||
*my_f += fxtmp;
|
||||
my_f += _nmax;
|
||||
*my_f += fytmp;
|
||||
my_f += _nmax;
|
||||
*my_f += fztmp;
|
||||
F_FLOAT* my_torque = _torque + i;
|
||||
F_CFLOAT* my_torque = _torque + i;
|
||||
*my_torque += torquextmp;
|
||||
my_torque += _nmax;
|
||||
*my_torque += torqueytmp;
|
||||
|
|
|
@ -63,10 +63,10 @@ void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -21,12 +21,12 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairLJ96CutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_FLOAT r3inv = _SQRT_(r6inv);
|
||||
const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT r3inv = _SQRT_(r6inv);
|
||||
const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]);
|
||||
|
||||
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]);
|
||||
|
||||
|
|
|
@ -33,12 +33,12 @@
|
|||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
|
||||
void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT));
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -46,7 +46,7 @@ void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_c
|
|||
|
||||
|
||||
void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
|
||||
int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
|
||||
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul)
|
||||
{
|
||||
|
||||
static short init = 0;
|
||||
|
@ -65,10 +65,10 @@ void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighl
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
|
||||
extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul);
|
||||
|
|
|
@ -20,24 +20,24 @@
|
|||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairLJCharmmCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
|
||||
F_FLOAT philj, switch1;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
|
||||
F_CFLOAT philj, switch1;
|
||||
|
||||
if(rsq > _cut_innersq_global) {
|
||||
switch1 = (_cutsq_global - rsq) * (_cutsq_global - rsq) *
|
||||
(_cutsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_innersq_global) * _denom_lj_inv;
|
||||
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
|
||||
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
|
||||
(rsq - _cut_innersq_global) * _denom_lj_inv;
|
||||
philj = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
|
||||
forcelj = forcelj * switch1 + philj * switch2;
|
||||
}
|
||||
|
||||
if(eflag) {
|
||||
ENERGY_FLOAT evdwl_tmp = factor_lj;
|
||||
ENERGY_CFLOAT evdwl_tmp = factor_lj;
|
||||
|
||||
if(rsq > _cut_innersq_global) {
|
||||
evdwl_tmp *= philj * switch1;
|
||||
|
@ -50,16 +50,16 @@ __device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij
|
|||
return factor_lj * forcelj * r2inv;
|
||||
}
|
||||
|
||||
__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
|
||||
__device__ inline F_CFLOAT CoulCharmmCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
|
||||
{
|
||||
F_FLOAT forcecoul;
|
||||
ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
|
||||
F_CFLOAT forcecoul;
|
||||
ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
|
||||
|
||||
if(rsq > _cut_coul_innersq_global) {
|
||||
const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
|
||||
const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
|
||||
(_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
|
||||
ecoul_tmp *= switch1;
|
||||
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
|
||||
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
|
||||
(rsq - _cut_coul_innersq_global) * _denom_coul_inv;
|
||||
forcecoul *= switch1 + switch2;
|
||||
}
|
||||
|
|
|
@ -30,9 +30,9 @@
|
|||
#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global)
|
||||
#define _denom_lj_inv MY_AP(denom_lj_inv)
|
||||
#define _denom_coul_inv MY_AP(denom_coul_inv)
|
||||
__device__ __constant__ F_FLOAT _cut_coul_innersq_global;
|
||||
__device__ __constant__ F_FLOAT _denom_lj_inv;
|
||||
__device__ __constant__ F_FLOAT _denom_coul_inv;
|
||||
__device__ __constant__ F_CFLOAT _cut_coul_innersq_global;
|
||||
__device__ __constant__ F_CFLOAT _denom_lj_inv;
|
||||
__device__ __constant__ F_CFLOAT _denom_coul_inv;
|
||||
|
||||
|
||||
#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
|
||||
|
@ -40,12 +40,12 @@ __device__ __constant__ F_FLOAT _denom_coul_inv;
|
|||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
|
||||
void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT));
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -53,7 +53,7 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLO
|
|||
|
||||
|
||||
void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
|
||||
int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
|
||||
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul)
|
||||
{
|
||||
|
||||
static short init = 0;
|
||||
|
@ -72,10 +72,10 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_share
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
|
||||
extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul);
|
||||
|
|
|
@ -21,16 +21,16 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
|
||||
__device__ inline F_CFLOAT CoulCharmmImplicitCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
|
||||
{
|
||||
F_FLOAT forcecoul;
|
||||
ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
|
||||
F_CFLOAT forcecoul;
|
||||
ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
|
||||
|
||||
if(rsq > _cut_coul_innersq_global) {
|
||||
const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
|
||||
const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
|
||||
(_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
|
||||
ecoul_tmp *= switch1;
|
||||
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
|
||||
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
|
||||
(rsq - _cut_coul_innersq_global) * _denom_coul_inv;
|
||||
forcecoul *= (switch1 + switch2);
|
||||
}
|
||||
|
|
|
@ -32,10 +32,10 @@
|
|||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_lj_inv)
|
||||
void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_CFLOAT denom_lj_inv)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_l
|
|||
|
||||
|
||||
void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
|
||||
int eflag_atom, int vflag_atom, F_FLOAT denom_lj)
|
||||
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj)
|
||||
{
|
||||
|
||||
static short init = 0;
|
||||
|
@ -62,10 +62,10 @@ void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj);
|
||||
extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj);
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
|
|||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
|
||||
//if(CUDA_ARCH==20) maxthreads*=2;
|
||||
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
|
||||
|
@ -60,10 +60,10 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -21,11 +21,11 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairLJClass2Cuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_FLOAT r3inv = _SQRT_(r6inv);
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT r3inv = _SQRT_(r6inv);
|
||||
|
||||
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv -
|
||||
_lj4[ij_type]) - _offset[ij_type]);
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
|
|||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
|
||||
//if(CUDA_ARCH==20) maxthreads*=2;
|
||||
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
|
||||
|
@ -60,10 +60,10 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -21,10 +21,10 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairLJCutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
|
||||
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv -
|
||||
_lj4[ij_type]) - _offset[ij_type]);
|
||||
|
|
|
@ -51,7 +51,7 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli
|
|||
dim3 grid, threads;
|
||||
int sharedperproc;
|
||||
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
|
||||
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
|
||||
//if(CUDA_ARCH==20) maxthreads*=2;
|
||||
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
|
||||
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
|
||||
|
@ -64,10 +64,10 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA_opt<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase);
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
||||
|
|
|
@ -62,10 +62,10 @@ void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -20,14 +20,14 @@
|
|||
|
||||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
|
||||
__device__ inline F_CFLOAT PairLJExpandCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
|
||||
{
|
||||
const F_FLOAT r = _SQRT_(rsq);
|
||||
const F_FLOAT rshift = r - _shift[ij_type];
|
||||
const F_FLOAT rshiftsq = rshift * rshift;
|
||||
const F_FLOAT r2inv = F_F(1.0) / rshiftsq;
|
||||
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
|
||||
const F_CFLOAT r = _SQRT_(rsq);
|
||||
const F_CFLOAT rshift = r - _shift[ij_type];
|
||||
const F_CFLOAT rshiftsq = rshift * rshift;
|
||||
const F_CFLOAT r2inv = F_F(1.0) / rshiftsq;
|
||||
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
|
||||
const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
|
||||
|
||||
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]);
|
||||
|
||||
|
|
|
@ -37,10 +37,10 @@
|
|||
#define _coulsw1 MY_AP(coulsw1)
|
||||
#define _coulsw2 MY_AP(coulsw2)
|
||||
#define _coulsw5 MY_AP(coulsw5)
|
||||
__device__ __constant__ F_FLOAT _cut_coul_inner_global;
|
||||
__device__ __constant__ F_FLOAT _coulsw1;
|
||||
__device__ __constant__ F_FLOAT _coulsw2;
|
||||
__device__ __constant__ F_FLOAT _coulsw5;
|
||||
__device__ __constant__ F_CFLOAT _cut_coul_inner_global;
|
||||
__device__ __constant__ F_CFLOAT _coulsw1;
|
||||
__device__ __constant__ F_CFLOAT _coulsw2;
|
||||
__device__ __constant__ F_CFLOAT _coulsw5;
|
||||
|
||||
|
||||
#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
|
||||
|
@ -48,13 +48,13 @@ __device__ __constant__ F_FLOAT _coulsw5;
|
|||
|
||||
#include <time.h>
|
||||
|
||||
void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
|
||||
void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5)
|
||||
{
|
||||
Cuda_Pair_Init_AllStyles(sdata, 9, true, true, true);
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_FLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_CFLOAT));
|
||||
cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_CFLOAT));
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -62,7 +62,7 @@ void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut
|
|||
|
||||
|
||||
void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
|
||||
int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
|
||||
int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5)
|
||||
{
|
||||
static short init = 0;
|
||||
|
||||
|
@ -80,10 +80,10 @@ void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neig
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
}
|
||||
|
|
|
@ -23,4 +23,4 @@
|
|||
|
||||
#include "cuda_shared.h"
|
||||
|
||||
extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5);
|
||||
extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5);
|
||||
|
|
|
@ -21,23 +21,23 @@
|
|||
This software is distributed under the GNU General Public License.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
|
||||
__device__ inline F_CFLOAT CoulGromacsCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
|
||||
{
|
||||
if(qij != F_F(0.0)) {
|
||||
F_FLOAT ecoul_tmp;
|
||||
F_FLOAT forcecoul = _RSQRT_(rsq);
|
||||
F_CFLOAT ecoul_tmp;
|
||||
F_CFLOAT forcecoul = _RSQRT_(rsq);
|
||||
|
||||
if(eflag) ecoul_tmp = forcecoul - _coulsw5;
|
||||
|
||||
if(rsq > _cut_coul_inner_global * _cut_coul_inner_global) {
|
||||
const F_FLOAT r = F_F(1.0) / forcecoul;
|
||||
const F_FLOAT tc = r - _cut_coul_inner_global;
|
||||
const F_CFLOAT r = F_F(1.0) / forcecoul;
|
||||
const F_CFLOAT tc = r - _cut_coul_inner_global;
|
||||
forcecoul += r * tc * tc * (_coulsw1 + _coulsw2 * tc);
|
||||
|
||||
if(eflag) ecoul_tmp -= tc * tc * tc * (_coulsw1 * (F_F(1.0) / F_F(3.0)) + _coulsw2 * tc * (F_F(1.0) / F_F(4.0)));
|
||||
}
|
||||
|
||||
F_FLOAT qprod = _qqrd2e * qij * factor_coul;
|
||||
F_CFLOAT qprod = _qqrd2e * qij * factor_coul;
|
||||
forcecoul *= qprod;
|
||||
|
||||
if(eflag) {
|
||||
|
|
|
@ -64,10 +64,10 @@ void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
|
|||
|
||||
if(sdata->pair.use_block_per_atom)
|
||||
Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
else
|
||||
Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
|
||||
|
||||
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue