git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12588 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2014-10-06 22:59:05 +00:00
parent 4bb43ca885
commit 621fa7d600
122 changed files with 1934 additions and 1929 deletions

View File

@@ -6,7 +6,7 @@ precision ?= 1
verbose ?= 1
#GPU architecture (compute capability): 13, 20, 21, 35
arch ?= 21
arch ?= 20
#Using cufft (should not be changed)
cufft ?= 1

View File

@@ -85,15 +85,15 @@ void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*));
if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
@@ -121,9 +121,9 @@ void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
cudaThreadSynchronize();
MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
@@ -143,14 +143,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = (n * n_data_items) * sizeof(X_FLOAT);
int size = (n * n_data_items) * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -185,8 +185,8 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
@@ -216,16 +216,16 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = (n * n_data_items) * sizeof(X_FLOAT);
int size = (n * n_data_items) * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
static int count = -1;
count++;
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -276,7 +276,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = (n * n_data_items) * sizeof(X_FLOAT);
int size = (n * n_data_items) * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
@@ -289,7 +289,7 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
@@ -463,14 +463,14 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = nsend * n_data_items * sizeof(X_FLOAT);
int size = nsend * n_data_items * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -522,14 +522,14 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = n * n_data_items * sizeof(X_FLOAT);
int size = n * n_data_items * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -584,7 +584,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
int n_data_items = AtomVecCuda_CountDataItems(data_mask);
int size = n * n_data_items * sizeof(X_FLOAT);
int size = n * n_data_items * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
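The rename that runs through this whole commit (X_FLOAT to X_CFLOAT, and likewise the V_, F_, ENERGY_, PPPM_, FFT_ and CUDA_ variants) swaps in new names for the package's compile-time precision aliases. As a minimal sketch of what such an alias amounts to, assuming the X_PRECISION convention visible in the texture-binding hunks below (1 selects single precision, otherwise double); the real definitions live in the package's precision header, and the quad type shown here is hypothetical:

#if X_PRECISION == 1
typedef float X_CFLOAT;                    // single-precision coordinates
typedef float4 X_CFLOAT4;                  // packed coordinate + type quad
#else
typedef double X_CFLOAT;                   // double-precision coordinates
struct X_CFLOAT4 { double x, y, z, w; };   // hypothetical double-wide quad
#endif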

View File

@@ -27,7 +27,7 @@
extern __shared__ int shared[];
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -40,44 +40,44 @@ __global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxli
int k = 0;
if(data_mask & X_MASK) {
((X_FLOAT*) buffer)[i + k * n] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx;
k++;
((X_FLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
k++;
((X_FLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
k++;
}
if(data_mask & V_MASK) {
((X_FLOAT*) buffer)[i + k * n] = _v[j];
((X_CFLOAT*) buffer)[i + k * n] = _v[j];
k++;
((X_FLOAT*) buffer)[i + k * n] = _v[j + _nmax];
((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax];
k++;
((X_FLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
k++;
}
if(data_mask & OMEGA_MASK) {
((X_FLOAT*) buffer)[i + k * n] = _omega[j];
((X_CFLOAT*) buffer)[i + k * n] = _omega[j];
k++;
((X_FLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
k++;
((X_FLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
k++;
}
if(data_mask & RADIUS_MASK)((X_FLOAT*) buffer)[i + k * n] = _radius[j];
if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j];
k++;
if(data_mask & RMASS_MASK)((X_FLOAT*) buffer)[i + k * n] = _rmass[j];
if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j];
k++;
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@@ -121,37 +121,37 @@ __global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffe
int k = 0;
if(data_mask & X_MASK) {
_x[i + first] = ((X_FLOAT*) buffer)[i + k * n];
_x[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
}
if(data_mask & V_MASK) {
_v[i + first] = ((X_FLOAT*) buffer)[i + k * n];
_v[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
}
if(data_mask & OMEGA_MASK) {
_omega[i + first] = ((X_FLOAT*) buffer)[i + k * n];
_omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_omega[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
_omega[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
_omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
}
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) buffer)[i + k * n];
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) buffer)[i + k * n];
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
k++;
}
@@ -163,8 +163,8 @@ __global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
double* buf = (double*) _buffer;
buf = &buf[1];
//X_FLOAT lo=slablo[iswap];
//X_FLOAT hi=slabhi[iswap];
//X_CFLOAT lo=slablo[iswap];
//X_CFLOAT hi=slabhi[iswap];
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
bool add = false;
@@ -369,7 +369,7 @@ __global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int*
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -379,37 +379,37 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max
int m = 0;
if(data_mask & X_MASK) {
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
}
if(data_mask & V_MASK) {
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j];
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
}
if(data_mask & TAG_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _tag[j];
if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j];
if(data_mask & TYPE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _type[j];
if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j];
if(data_mask & MASK_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _mask[j];
if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j];
if(data_mask & Q_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _q[j];
if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j];
if(data_mask & MOLECULE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
if(data_mask & RADIUS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _radius[i];
if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[i];
if(data_mask & DENSITY_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _density[i];
if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[i];
if(data_mask & RMASS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _rmass[i];
if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[i];
if(data_mask & OMEGA_MASK) {
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i];
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax];
((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + _nmax];
((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[i + 2 * _nmax];
}
}
}
@@ -417,7 +417,7 @@ __global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int max
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -471,37 +471,37 @@ __global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
int m = 0;
if(data_mask & X_MASK) {
_x[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_x[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_x[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
}
if(data_mask & V_MASK) {
_v[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_v[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_v[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
}
if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & Q_MASK) _q[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
if(data_mask & DENSITY_MASK) _density[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
if(data_mask & OMEGA_MASK) {
_omega[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_omega[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_omega[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
_omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
_omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
}
} else {
_flag[0] = 1;
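All of the pack/unpack kernels above address the staging buffer as buffer[i + k * n]: with n packed atoms, item k of every atom occupies one contiguous n-long stripe, so thread i of a warp always touches consecutive addresses and the loads and stores coalesce. A self-contained sketch of that layout (the helper name is illustrative, not from the source):

// Item-major staging layout: [x0..x(n-1) | y0..y(n-1) | z0..z(n-1) | ...]
__device__ void pack_xyz(X_CFLOAT* buf, int i, int n,
                         X_CFLOAT xv, X_CFLOAT yv, X_CFLOAT zv)
{
  buf[i + 0 * n] = xv;   // stripe 0: x components
  buf[i + 1 * n] = yv;   // stripe 1: y components
  buf[i + 2 * n] = zv;   // stripe 2: z components
}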

View File

@@ -34,7 +34,7 @@
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
{
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@@ -53,9 +53,9 @@ void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
@@ -65,7 +65,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata)
Cuda_CommCuda_UpdateNmax(sdata);
int ntypesp = sdata->atom.ntypes + 1;
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*));
}
@@ -81,14 +81,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -123,8 +123,8 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
@@ -151,14 +151,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 6 * sizeof(X_FLOAT);
int size = n * 6 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -193,8 +193,8 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
@@ -221,16 +221,16 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
static int count = -1;
count++;
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -278,16 +278,16 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 6 * sizeof(X_FLOAT);
int size = n * 6 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
static int count = -1;
count++;
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@@ -334,7 +334,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
@@ -347,7 +347,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
@@ -375,7 +375,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 6 * sizeof(X_FLOAT);
int size = n * 6 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
@@ -388,7 +388,7 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
@@ -414,22 +414,22 @@ int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* b
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_FLOAT);
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
F_FLOAT* buf = (F_FLOAT*)buf_send;
F_FLOAT* f_dev = (F_FLOAT*)sdata->atom.f.dev_data;
F_CFLOAT* buf = (F_CFLOAT*)buf_send;
F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data;
f_dev += first;
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
return n * 3;
}
@@ -442,7 +442,7 @@ void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_FLOAT);
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
@@ -468,7 +468,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap,
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
@@ -520,9 +520,9 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in
my_gettime(CLOCK_REALTIME, &time1);
if(style == 1)
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
else
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time2);
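One pattern worth noting in the host code above: cudaMemcpyToSymbol copies the device pointer values, not the arrays they point to, into __constant__ symbols, so the kernels can read _x, _v, _f and friends without receiving them as arguments. A reduced sketch, assuming MY_AP(name) expands to such a prefixed constant symbol (the declaration matches the one in the constants header later in this diff):

__device__ __constant__ X_CFLOAT* _x;   // written by the host, read by kernels

void update_x_pointer(cuda_shared_data* sdata)
{
  // Copies only the pointer into constant memory; the coordinate
  // data itself stays where it already is on the device.
  cudaMemcpyToSymbol(_x, &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
}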

View File

@@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -31,13 +31,13 @@ __global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistl
if(j > _nmax) _flag[0] = 1;
((X_FLOAT*) buffer)[i] = _x[j] + dx;
((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
}
}
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -47,16 +47,16 @@ __global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxli
if(j > _nmax) _flag[0] = 1;
((X_FLOAT*) buffer)[i] = _x[j] + dx;
((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
((X_FLOAT*) buffer)[i + 3 * n] = _v[j];
((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) buffer)[i + 3 * n] = _v[j];
((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
}
}
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@@ -72,7 +72,7 @@ __global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int max
}
}
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@@ -96,9 +96,9 @@ __global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < n) {
_x[i + first] = ((X_FLOAT*) buffer)[i];
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
_x[i + first] = ((X_CFLOAT*) buffer)[i];
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n];
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n];
}
}
@@ -108,12 +108,12 @@ __global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffe
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < n) {
_x[i + first] = ((X_FLOAT*) buffer)[i];
_x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
_x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
_v[i + first] = ((X_FLOAT*) buffer)[i + 3 * n];
_v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 4 * n];
_v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n];
_x[i + first] = ((X_CFLOAT*) buffer)[i];
_x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 1 * n];
_x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 2 * n];
_v[i + first] = ((X_CFLOAT*) buffer)[i + 3 * n];
_v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + 4 * n];
_v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + 5 * n];
}
}
@@ -122,9 +122,9 @@ __global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < n) {
((F_FLOAT*) _buffer)[i] = _f[i + first];
((F_FLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
((F_CFLOAT*) _buffer)[i] = _f[i + first];
((F_CFLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
((F_CFLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
}
}
@@ -136,9 +136,9 @@ __global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int max
if(i < n) {
int j = list[i];
_f[j] += ((F_FLOAT*)_buffer)[i];
_f[j + _nmax] += ((F_FLOAT*) _buffer)[i + n];
_f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n];
_f[j] += ((F_CFLOAT*)_buffer)[i];
_f[j + _nmax] += ((F_CFLOAT*) _buffer)[i + n];
_f[j + 2 * _nmax] += ((F_CFLOAT*) _buffer)[i + 2 * n];
}
}
@@ -161,11 +161,11 @@ __global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, in
extern __shared__ int shared[];
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
int nfirst, int nlast, int dim, int iswap, X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength)
int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
X_FLOAT lo = slablo[iswap];
X_FLOAT hi = slabhi[iswap];
X_CFLOAT lo = slablo[iswap];
X_CFLOAT hi = slabhi[iswap];
bool add = false;
if(!bordergroup || ineed >= 2) {
@@ -273,11 +273,11 @@ __global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, i
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
, int nfirst, int nlast, int dim, int iswap, X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength)
, int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes];
X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes];
X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes];
X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes];
int itype = 0;
bool add = false;
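BuildSendlist_Single above selects atoms whose coordinate along dim falls inside the slab [lo, hi] for swap iswap and appends their indices to the per-swap send list, staging indices through (threads.x + 1) ints of shared memory. A deliberately simplified sketch of the same selection using one global atomic counter instead of the shared-memory staging (all names and the exact slab test here are assumptions):

__global__ void build_sendlist_sketch(const X_CFLOAT* x, int nmax, int nlocal,
                                      int dim, X_CFLOAT lo, X_CFLOAT hi,
                                      int* list, int* nsend)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nlocal) {
    X_CFLOAT xi = x[i + dim * nmax];     // component 'dim' of atom i
    if (xi >= lo && xi <= hi)
      list[atomicAdd(nsend, 1)] = i;     // append atom i (order not preserved)
  }
}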

View File

@@ -33,7 +33,7 @@
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@@ -50,15 +50,15 @@ void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
@@ -68,7 +68,7 @@ void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
}
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
@@ -82,7 +82,7 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit);
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
@@ -90,13 +90,13 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_F
grid.x = 6;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
@@ -111,7 +111,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F
if(sdata->atom.nlocal > 0) {
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit);
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
@@ -119,7 +119,7 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_F
grid.x = 1;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
}
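The temperature computation above is a two-pass reduction: the first kernel leaves one partial sum per block in _buffer (using threads.x * sizeof(ENERGY_CFLOAT) of dynamic shared memory, folded with the reduceBlock helper seen in the kernel files), and the follow-up Reduce_Kernel launch with grid.x = 1 or 6 collapses those partials into the final scalar or 6-component vector. A plausible shape for such a block-wide sum, assuming a power-of-two block size; the real reduceBlock may differ:

__device__ void reduce_block_sum(ENERGY_CFLOAT* s)
{
  __syncthreads();   // make every thread's partial value visible
  for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride)
      s[threadIdx.x] += s[threadIdx.x + stride];
    __syncthreads();
  }
  // s[0] now holds the block's sum; thread 0 writes it to _buffer.
}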

View File

@@ -24,5 +24,5 @@
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);

View File

@@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
extern __shared__ ENERGY_CFLOAT sharedmem[];
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
@@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
@@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
if(i < _nlocal)
if(_mask[i] & groupbit) {
V_FLOAT massone;
V_CFLOAT massone;
if(_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
@@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
reduceBlock(&sharedmem[3 * blockDim.x]);
reduceBlock(&sharedmem[4 * blockDim.x]);
reduceBlock(&sharedmem[5 * blockDim.x]);
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
@@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
}
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
ENERGY_FLOAT myforig = 0.0;
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT myforig = 0.0;
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {

View File

@@ -33,7 +33,7 @@
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@@ -50,15 +50,15 @@ void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
@@ -68,7 +68,7 @@ void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
}
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
@@ -82,20 +82,20 @@ void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, E
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
int oldgrid = grid.x * grid.y;
grid.x = 6;
threads.x = 512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
@@ -110,14 +110,14 @@ void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, E
if(sdata->atom.nlocal > 0) {
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
int oldgrid = grid.x * grid.y;
grid.x = 1;
threads.x = 512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
}
@@ -137,7 +137,7 @@ void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int grou
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
@@ -157,7 +157,7 @@ void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int gro
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}

View File

@@ -24,7 +24,7 @@
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);

View File

@@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
extern __shared__ ENERGY_CFLOAT sharedmem[];
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
@@ -40,7 +40,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xfla
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@@ -59,7 +59,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
if(i < _nlocal)
if(_mask[i] & groupbit) {
V_FLOAT massone;
V_CFLOAT massone;
if(_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
@@ -78,7 +78,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
reduceBlock(&sharedmem[3 * blockDim.x]);
reduceBlock(&sharedmem[4 * blockDim.x]);
reduceBlock(&sharedmem[5 * blockDim.x]);
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@@ -91,12 +91,12 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xfla
}
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
ENERGY_FLOAT myforig = 0.0;
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT myforig = 0.0;
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {
@@ -117,7 +117,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t
t[blockIdx.x] = myforig;
}
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@@ -140,7 +140,7 @@ __global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, i
}
}
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

View File

@@ -640,11 +640,11 @@ static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
return __hiloint2double(v.y, v.x);
}
static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
X_FLOAT4 w;
X_CFLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
@@ -664,7 +664,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata)
#if X_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
@@ -672,7 +672,7 @@ inline void BindXTypeTexture(cuda_shared_data* sdata)
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
@@ -692,11 +692,11 @@ static __device__ inline double tex1Dfetch_double_v(texture<int4, 1> t, int i)
return __hiloint2double(v.y, v.x);
}
static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
V_FLOAT4 w;
V_CFLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
@@ -716,7 +716,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata)
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_FLOAT4));
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
@@ -724,7 +724,7 @@ inline void BindVRadiusTexture(cuda_shared_data* sdata)
#endif
}
static __device__ inline V_FLOAT4 fetchVRadius(int i)
static __device__ inline V_CFLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
@@ -747,7 +747,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_FLOAT4));
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
@@ -755,7 +755,7 @@ inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
#endif
}
static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
@@ -775,11 +775,11 @@ static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
return __hiloint2double(v.y, v.x);
}
static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
F_FLOAT4 w;
F_CFLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
@@ -799,7 +799,7 @@ inline void BindQTexture(cuda_shared_data* sdata)
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
@@ -807,7 +807,7 @@ inline void BindQTexture(cuda_shared_data* sdata)
#endif
}
static __device__ inline F_FLOAT fetchQ(int i)
static __device__ inline F_CFLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
@@ -835,7 +835,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
@@ -843,7 +843,7 @@ inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
@@ -863,7 +863,7 @@ static inline __device__ int sbmask(int j)
return j >> SBBITS & 3;
}
static inline __device__ void minimum_image(X_FLOAT4 &delta)
static inline __device__ void minimum_image(X_CFLOAT4 &delta)
{
if(_triclinic == 0) {
if(_periodicity[0]) {
@@ -907,7 +907,7 @@ static inline __device__ void minimum_image(X_FLOAT4 &delta)
}
}
static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci)
{
ci.x = x2.x - x1.x;
ci.y = x2.y - x1.y;
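The texture helpers above exist because these GPUs have no native double-precision texture fetch: a double array is bound as int2 (int4 for quads, fetched twice per element) and reassembled with __hiloint2double, while an X_PRECISION == 1 build binds float4 and fetches directly. The fetchXType bodies are truncated in the hunks above; a sketch of the usual fetch-or-fall-back shape they presumably take (the texture variable name here is an assumption):

static __device__ inline X_CFLOAT4 fetchXType_sketch(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
  return tex1Dfetch(x_type_texture, i);          // native float4 fetch
#else
  return tex1Dfetch_double(x_type_texture, i);   // two int4 fetches -> doubles
#endif
#else
  return _x_type[i];                             // plain global-memory load
#endif
}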

View File

@@ -4,12 +4,12 @@
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
sdata->compile_settings.prec_glob = sizeof(CUDA_FLOAT) / 4;
sdata->compile_settings.prec_x = sizeof(X_FLOAT) / 4;
sdata->compile_settings.prec_v = sizeof(V_FLOAT) / 4;
sdata->compile_settings.prec_f = sizeof(F_FLOAT) / 4;
sdata->compile_settings.prec_pppm = sizeof(PPPM_FLOAT) / 4;
sdata->compile_settings.prec_fft = sizeof(FFT_FLOAT) / 4;
sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4;
sdata->compile_settings.prec_x = sizeof(X_CFLOAT) / 4;
sdata->compile_settings.prec_v = sizeof(V_CFLOAT) / 4;
sdata->compile_settings.prec_f = sizeof(F_CFLOAT) / 4;
sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4;
sdata->compile_settings.prec_fft = sizeof(FFT_CFLOAT) / 4;
#ifdef FFT_CUFFT
sdata->compile_settings.cufft = 1;
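The prec_* fields above record each configured type's width in 4-byte units, so a single-precision build stores 1 and a double-precision build stores 2, presumably letting the host library reject a mismatched device build at startup. The arithmetic, as two compile-time checks:

static_assert(sizeof(float)  / 4 == 1, "single precision encodes as 1");
static_assert(sizeof(double) / 4 == 2, "double precision encodes as 2");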

View File

@@ -60,7 +60,7 @@
//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
//#define &MY_AP(var) &(MY_AP(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
#define CUDA_USE_CFLOAT4
//constants used by many classes
@@ -79,20 +79,20 @@
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ X_CFLOAT _boxhi[3];
__device__ __constant__ X_CFLOAT _boxlo[3];
__device__ __constant__ X_CFLOAT _subhi[3];
__device__ __constant__ X_CFLOAT _sublo[3];
__device__ __constant__ X_CFLOAT _box_size[3];
__device__ __constant__ X_CFLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
__device__ __constant__ X_CFLOAT _boxhi_lamda[3];
__device__ __constant__ X_CFLOAT _boxlo_lamda[3];
__device__ __constant__ X_CFLOAT _prd_lamda[3];
__device__ __constant__ X_CFLOAT _h[6];
__device__ __constant__ X_CFLOAT _h_inv[6];
__device__ __constant__ V_CFLOAT _h_rate[6];
//atom properties
@ -123,31 +123,31 @@ __device__ __constant__ V_FLOAT _h_rate[6];
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions
__device__ __constant__ V_CFLOAT* _v;
__device__ __constant__ F_CFLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ V_CFLOAT* _mass;
__device__ __constant__ F_CFLOAT* _q;
__device__ __constant__ V_CFLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to positions
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ ENERGY_CFLOAT* _eatom;
__device__ __constant__ ENERGY_CFLOAT* _vatom;
__device__ __constant__ X_CFLOAT4* _x_type; //packed positions + type (type in .w)
__device__ __constant__ X_CFLOAT* _radius;
__device__ __constant__ F_CFLOAT* _density;
__device__ __constant__ V_CFLOAT* _omega;
__device__ __constant__ F_CFLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to positions
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to positions
__device__ __constant__ V_CFLOAT4* _v_radius; //packed velocities + radius (radius in .w)
__device__ __constant__ V_CFLOAT4* _omega_rmass; //packed angular velocities + per-atom mass (rmass in .w)
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
@ -226,8 +226,8 @@ __device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions
__device__ __constant__ X_CFLOAT _triggerneighsq;
__device__ __constant__ X_CFLOAT* _xhold; //positions at last neighbor-list build
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
@ -253,12 +253,12 @@ __device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ V_CFLOAT _dtf;
__device__ __constant__ X_CFLOAT _dtv;
__device__ __constant__ V_CFLOAT _factor;
__device__ __constant__ ENERGY_CFLOAT* _virial;
__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_CFLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
@ -55,30 +55,30 @@ enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_L
#define _cutsq_global MY_AP(cutsq_global)
#define _collect_forces_later MY_AP(collect_forces_later)
__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2];
__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2];
__device__ __constant__ F_FLOAT _special_lj[4];
__device__ __constant__ F_FLOAT _special_coul[4];
__device__ __constant__ X_FLOAT _cutsq_global;
__device__ __constant__ X_CFLOAT _cutsq[CUDA_MAX_TYPES2];
__device__ __constant__ ENERGY_CFLOAT _offset[CUDA_MAX_TYPES2];
__device__ __constant__ F_CFLOAT _special_lj[4];
__device__ __constant__ F_CFLOAT _special_coul[4];
__device__ __constant__ X_CFLOAT _cutsq_global;
__device__ __constant__ int _collect_forces_later;
__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
__device__ __constant__ F_CFLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
__device__ __constant__ F_CFLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
__device__ __constant__ F_CFLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
__device__ __constant__ F_CFLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
__device__ __constant__ F_CFLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm);
__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
__device__ __constant__ F_CFLOAT* MY_AP(coeff2_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff3_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff4_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff5_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff6_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff7_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff8_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff9_gm);
__device__ __constant__ F_CFLOAT* MY_AP(coeff10_gm);
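As the comments note, the coefficient tables exist in two flavors: the __constant__ arrays above are used when every type pair fits into constant memory, while the *_gm pointers fall back to global memory, read through the textures defined next. A sketch of the size test this implies (illustrative names; Cuda_Pair_Init_AllStyles further down performs the actual check):

unsigned cuda_ntypes  = ntypes + 1;                        // type indices start at 1
unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
bool use_global_params = (cuda_ntypes2 > CUDA_MAX_TYPES2); // too large for __constant__ space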
#define _coeff1_gm_tex MY_AP(coeff1_gm_tex)
#if F_PRECISION == 1
@ -159,17 +159,17 @@ texture<int2, 1> _coeff10_gm_tex;
#define _g_ewald MY_AP(g_ewald)
#define _qqrd2e MY_AP(qqrd2e)
#define _kappa MY_AP(kappa)
__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2];
__device__ __constant__ X_FLOAT _cut_coulsq_global;
__device__ __constant__ F_FLOAT _g_ewald;
__device__ __constant__ F_FLOAT _qqrd2e;
__device__ __constant__ F_FLOAT _kappa;
__device__ __constant__ X_CFLOAT _cut_coulsq[CUDA_MAX_TYPES2];
__device__ __constant__ X_CFLOAT _cut_coulsq_global;
__device__ __constant__ F_CFLOAT _g_ewald;
__device__ __constant__ F_CFLOAT _qqrd2e;
__device__ __constant__ F_CFLOAT _kappa;
//inner cutoff
#define _cut_innersq MY_AP(cut_innersq)
#define _cut_innersq_global MY_AP(cut_innersq_global)
__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2];
__device__ __constant__ X_FLOAT _cut_innersq_global;
__device__ __constant__ X_CFLOAT _cut_innersq[CUDA_MAX_TYPES2];
__device__ __constant__ X_CFLOAT _cut_innersq_global;
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
@ -241,14 +241,14 @@ void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighli
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
//Atom
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
//Other
@ -261,8 +261,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
{
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2;
unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2;
unsigned nx = sizeof(X_CFLOAT) * cuda_ntypes2;
//check if enough constant memory is available
if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params)
@ -275,24 +275,24 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
//type conversion of cutoffs and parameters
if(need_cut) {
X_FLOAT cutsq[cuda_ntypes2];
X_CFLOAT cutsq[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
}
}
int cutsqdiffer = 0;
X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
X_CFLOAT cutsq_global;
cutsq_global = (X_CFLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
if(sdata->pair.cut) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut[i][j] > 1e-6) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
}
if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
@ -307,8 +307,8 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut[i][j] > 1e-6) {
cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
cutsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cutsq[i][j]);
cutsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cutsq[i][j]);
}
if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
@ -326,28 +326,28 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx);
}
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT));
}
if(need_innercut) {
X_FLOAT cut_innersq[cuda_ntypes2];
X_CFLOAT cut_innersq[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
}
}
int cutsqdiffer = 0;
X_FLOAT cut_innersq_global;
cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
X_CFLOAT cut_innersq_global;
cut_innersq_global = (X_CFLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
if(sdata->pair.cut_inner) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut_inner[i][j] > 1e-6) {
cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
cut_innersq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
cut_innersq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
}
if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];
@ -363,30 +363,30 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx);
}
cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_CFLOAT));
}
if(need_q) {
X_FLOAT cut_coulsq[cuda_ntypes2];
X_CFLOAT cut_coulsq[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
}
}
int cutsqdiffer = 0;
X_FLOAT cut_coulsq_global;
cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
X_CFLOAT cut_coulsq_global;
cut_coulsq_global = (X_CFLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global;
if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_CFLOAT) sdata->pair.cut_coulsq_global;
if(sdata->pair.cut_coul) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = i; j <= sdata->atom.ntypes; ++j) {
if(sdata->pair.cut_coul[i][j] > 1e-6) {
cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
cut_coulsq[i * cuda_ntypes + j] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
cut_coulsq[j * cuda_ntypes + i] = (X_CFLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
}
if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j];
@ -402,22 +402,22 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx);
}
cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_CFLOAT));
}
CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed");
if(ncoeff > 0) {
F_FLOAT coeff1[cuda_ntypes2];
F_CFLOAT coeff1[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j];
coeff1[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff1[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice);
_coeff1_gm_tex.normalized = false; // access with unnormalized texture coordinates
@ -429,7 +429,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed");
cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed");
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
@ -445,16 +445,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed");
if(ncoeff > 1) {
F_FLOAT coeff2[cuda_ntypes2];
F_CFLOAT coeff2[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j];
coeff2[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff2[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice);
_coeff2_gm_tex.normalized = false; // access with unnormalized texture coordinates
@ -464,7 +464,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -477,16 +477,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed");
if(ncoeff > 2) {
F_FLOAT coeff3[cuda_ntypes2];
F_CFLOAT coeff3[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j];
coeff3[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff3[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice);
_coeff3_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -495,7 +495,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -507,16 +507,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed");
if(ncoeff > 3) {
F_FLOAT coeff4[cuda_ntypes2];
F_CFLOAT coeff4[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
coeff4[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff4[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
_coeff4_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -525,7 +525,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -537,16 +537,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");
if(ncoeff > 4) {
F_FLOAT coeff5[cuda_ntypes2];
F_CFLOAT coeff5[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
coeff5[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff5[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
_coeff5_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -555,7 +555,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -567,16 +567,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
if(ncoeff > 5) {
F_FLOAT coeff6[cuda_ntypes2];
F_CFLOAT coeff6[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
coeff6[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff6[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
_coeff6_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -585,7 +585,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -596,16 +596,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");
if(ncoeff > 6) {
F_FLOAT coeff7[cuda_ntypes2];
F_CFLOAT coeff7[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
coeff7[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff7[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
_coeff7_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -614,7 +614,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -625,16 +625,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");
if(ncoeff > 7) {
F_FLOAT coeff8[cuda_ntypes2];
F_CFLOAT coeff8[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
coeff8[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff8[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
_coeff8_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -643,7 +643,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -654,16 +654,16 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");
if(ncoeff > 8) {
F_FLOAT coeff9[cuda_ntypes2];
F_CFLOAT coeff9[cuda_ntypes2];
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
coeff9[i * cuda_ntypes + j] = (F_CFLOAT) sdata->pair.coeff9[i][j];
}
}
if(use_global_params) {
cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_CFLOAT*));
cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
_coeff9_gm_tex.normalized = false; // access with unnormalized texture coordinates
_coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering
@ -672,7 +672,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
@ -682,40 +682,40 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");
F_FLOAT special_lj[4];
F_CFLOAT special_lj[4];
special_lj[0] = sdata->pair.special_lj[0];
special_lj[1] = sdata->pair.special_lj[1];
special_lj[2] = sdata->pair.special_lj[2];
special_lj[3] = sdata->pair.special_lj[3];
X_FLOAT box_size[3] = {
X_CFLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_CFLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
if(need_q) {
F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
F_FLOAT special_coul[4];
F_CFLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
F_CFLOAT special_coul[4];
special_coul[0] = sdata->pair.special_coul[0];
special_coul[1] = sdata->pair.special_coul[1];
special_coul[2] = sdata->pair.special_coul[2];
special_coul[3] = sdata->pair.special_coul[3];
cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_CFLOAT) * 4);
cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_CFLOAT*));
}
CUT_CHECK_ERROR("Cuda_Pair: init failed");
@ -763,7 +763,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
maxthreads = 64;
}
int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_CFLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
threads.x = layout.z;
threads.y = 1;
threads.z = 1;
@ -771,9 +771,9 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
grid.y = layout.y;
grid.z = 1;
int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_CFLOAT);
if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_CFLOAT));
Cuda_UpdateBuffer(sdata, size);
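To put numbers on that buffer size (the figures here are illustrative assumptions, not values from this commit): with 1000 blocks and sharedperproc = 8 (one evdwl, one ecoul, and six virial accumulators) in double precision, layout.y * layout.x * sharedperproc * sizeof(ENERGY_CFLOAT) = 1000 * 8 * 8 B = 64 kB of per-block partial sums, plus nmax * 3 * sizeof(F_CFLOAT) when forces are collected later.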
@ -787,7 +787,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
my_gettime(CLOCK_REALTIME, &startpairtime);
MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
}
//Function called after the kernel invocation; collects energy and virial
@ -810,8 +810,8 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh
grid.y = 1;
dim3 threads(128, 1, 1);
MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
}
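PairVirialCompute_reduce folds the per-block partial sums left in _buffer into scalar totals; the threads.x * sizeof(ENERGY_CFLOAT) launch argument is the dynamic shared memory the reduction runs in. An illustrative stand-alone sketch of the underlying shared-memory tree reduction (not the kernel itself), reusing the ENERGY_CFLOAT/ENERGY_F macros of this package:

extern __shared__ ENERGY_CFLOAT sred[];

__global__ void block_reduce_sketch(ENERGY_CFLOAT* in, ENERGY_CFLOAT* out, int n)
{
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;
    sred[tid] = (i < n) ? in[i] : ENERGY_F(0.0);  // load one element per thread
    __syncthreads();

    for(int s = blockDim.x / 2; s > 0; s >>= 1) { // pairwise tree reduction; assumes
        if(tid < s) sred[tid] += sred[tid + s];   // blockDim.x is a power of two
        __syncthreads();
    }

    if(tid == 0) out[blockIdx.x] = sred[0];       // one partial sum per block
}

It would be launched as block_reduce_sketch<<<grid, threads, threads.x * sizeof(ENERGY_CFLOAT)>>>(in, out, n), mirroring the call above.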
@ -863,15 +863,15 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
}
@ -999,7 +999,7 @@ void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
grid.y = 1;
threads.x = 128;
//printf("A grid.x: %i\n",grid.x);
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
}
@ -32,12 +32,12 @@
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedECoul;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedECoul;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
sharedE = &sharedmem[threadIdx.x];
@ -62,12 +62,12 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT fxtmp, fytmp, fztmp, fpair;
F_FLOAT delx, dely, delz;
F_FLOAT factor_lj, factor_coul;
F_FLOAT qtmp;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT qtmp;
int itype, i, j;
int jnum = 0;
int* jlist;
@ -114,7 +114,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
@ -171,7 +171,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
}
if(coul_type != COUL_NONE) {
const F_FLOAT qiqj = qtmp * fetchQ(j);
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > 1e-8) {
const bool in_coul_cutoff =
@ -188,7 +188,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_CUT: {
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
@ -199,11 +199,11 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_DEBYE: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0) / r;
const F_FLOAT screening = _EXP_(-_kappa * r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
@ -219,14 +219,14 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_LONG: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij * grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
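For reference, the COUL_LONG branch is the real-space Ewald contribution: EWALD_P and A1..A5 appear to be the coefficients of the Abramowitz-Stegun rational approximation of the complementary error function,

$$\operatorname{erfc}(x) \approx t\,\bigl(A_1 + t(A_2 + t(A_3 + t(A_4 + t\,A_5)))\bigr)\,e^{-x^2}, \qquad t = \frac{1}{1 + P\,x},$$

and with EWALD_F = 2/\sqrt{\pi}, g = _g_ewald, and pair distance r, the lines above evaluate

$$\mathtt{forcecoul} = \mathtt{qqrd2e}\,\frac{q_i q_j}{r}\left(\operatorname{erfc}(g r) + \frac{2}{\sqrt{\pi}}\,g r\,e^{-(g r)^2}\right),$$

which is exactly prefactor * (erfc + EWALD_F * grij * expm2).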
@ -248,7 +248,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
if(in_cutoff) {
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * fpair;
fytmp += dyfp = dely * fpair;
fztmp += dzfp = delz * fpair;
@ -268,10 +268,10 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
__syncthreads();
if(ii < _inum) {
F_FLOAT* my_f;
F_CFLOAT* my_f;
if(_collect_forces_later) {
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer = &buffer[1 * gridDim.x * gridDim.y];
@ -284,7 +284,7 @@ __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_FLOAT*) buffer;
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = fxtmp;
my_f += _nmax;
@ -337,14 +337,14 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
if(ii >= _inum)
return;
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
F_FLOAT3* sharedVirial1;
F_FLOAT3* sharedVirial2;
F_FLOAT* sharedEnergy;
F_FLOAT* sharedEnergyCoul;
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
F_CFLOAT3* sharedVirial1;
F_CFLOAT3* sharedVirial2;
F_CFLOAT* sharedEnergy;
F_CFLOAT* sharedEnergyCoul;
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
if(vflag) {
sharedVirial1 = &sharedForce[64];
@ -356,25 +356,25 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
if(eflag) {
if(vflag || vflag_atom)
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_FLOAT*) &sharedForce[64];
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
if(coul_type != COUL_NONE)
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
}
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT delx, dely, delz;
F_FLOAT factor_lj, factor_coul;
F_FLOAT fpair;
F_FLOAT qtmp;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT fpair;
F_CFLOAT qtmp;
int itype, jnum, i, j;
int* jlist;
@ -413,7 +413,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
@ -471,7 +471,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
}
if(coul_type != COUL_NONE) {
const F_FLOAT qiqj = qtmp * fetchQ(j);
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > (1e-8f)) {
in_coul_cutoff =
@ -492,14 +492,14 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_LONG: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij * grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
@ -514,11 +514,11 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_DEBYE: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0) / r;
const F_FLOAT screening = _EXP_(-_kappa * r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
@ -530,7 +530,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
break;
case COUL_CUT: {
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
@ -549,7 +549,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
if(in_cutoff || in_coul_cutoff) {
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
partialForce.x += dxfp = delx * fpair;
partialForce.y += dyfp = dely * fpair;
partialForce.z += dzfp = delz * fpair;
@ -613,10 +613,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
if(threadIdx.x == 0) {
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
ENERGY_FLOAT tmp_evdwl;
ENERGY_CFLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
@ -635,7 +635,7 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
}
if(vflag) {
ENERGY_FLOAT tmp;
ENERGY_CFLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
@ -663,10 +663,10 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
F_FLOAT* my_f;
F_CFLOAT* my_f;
if(_collect_forces_later) {
my_f = (F_FLOAT*) buffer;
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x;
my_f += _nmax;
@ -688,12 +688,12 @@ __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedECoul;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedECoul;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
sharedE = &sharedmem[threadIdx.x];
@ -718,12 +718,12 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT fxtmp, fytmp, fztmp, fpair;
F_FLOAT delx, dely, delz;
F_FLOAT factor_lj, factor_coul;
F_FLOAT qtmp;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT qtmp;
int itype, i, j;
int jnum = 0;
int* jlist;
@ -774,7 +774,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
@ -831,7 +831,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
}
if(coul_type != COUL_NONE) {
const F_FLOAT qiqj = qtmp * fetchQ(j);
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > 1e-8) {
const bool in_coul_cutoff =
@ -848,7 +848,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_CUT: {
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
@ -859,11 +859,11 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_DEBYE: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0) / r;
const F_FLOAT screening = _EXP_(-_kappa * r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
@ -879,14 +879,14 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_LONG: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij * grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
@ -909,7 +909,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
if(in_cutoff) {
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * fpair;
fytmp += dyfp = dely * fpair;
fztmp += dzfp = delz * fpair;
@ -929,10 +929,10 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
__syncthreads();
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
F_FLOAT* my_f;
F_CFLOAT* my_f;
if(_collect_forces_later) {
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer = &buffer[1 * gridDim.x * gridDim.y];
@ -945,7 +945,7 @@ __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vf
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_FLOAT*) buffer;
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = fxtmp;
my_f += _nmax;
@ -998,14 +998,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
return;
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
F_FLOAT3* sharedVirial1;
F_FLOAT3* sharedVirial2;
F_FLOAT* sharedEnergy;
F_FLOAT* sharedEnergyCoul;
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
F_CFLOAT3* sharedVirial1;
F_CFLOAT3* sharedVirial2;
F_CFLOAT* sharedEnergy;
F_CFLOAT* sharedEnergyCoul;
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
if(vflag) {
sharedVirial1 = &sharedForce[64];
@ -1017,25 +1017,25 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
if(eflag) {
if(vflag || vflag_atom)
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_FLOAT*) &sharedForce[64];
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
if(coul_type != COUL_NONE)
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
}
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT delx, dely, delz;
F_FLOAT factor_lj, factor_coul;
F_FLOAT fpair;
F_FLOAT qtmp;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT fpair;
F_CFLOAT qtmp;
int itype, jnum, i, j;
int* jlist;
@ -1074,7 +1074,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
@ -1132,7 +1132,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
}
if(coul_type != COUL_NONE) {
const F_FLOAT qiqj = qtmp * fetchQ(j);
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > (1e-8f)) {
in_coul_cutoff =
@ -1153,14 +1153,14 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_LONG: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij * grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
@ -1175,11 +1175,11 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_DEBYE: {
const F_FLOAT r2inv = F_F(1.0) / rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0) / r;
const F_FLOAT screening = _EXP_(-_kappa * r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
@ -1191,7 +1191,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
break;
case COUL_CUT: {
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
@ -1210,7 +1210,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
if(in_cutoff || in_coul_cutoff) {
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
partialForce.x += dxfp = delx * fpair;
partialForce.y += dyfp = dely * fpair;
partialForce.z += dzfp = delz * fpair;
@ -1274,10 +1274,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
if(threadIdx.x == 0) {
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
ENERGY_FLOAT tmp_evdwl;
ENERGY_CFLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
@ -1296,7 +1296,7 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
}
if(vflag) {
ENERGY_FLOAT tmp;
ENERGY_CFLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
@ -1324,10 +1324,10 @@ __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vf
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
F_FLOAT* my_f;
F_CFLOAT* my_f;
if(_collect_forces_later) {
my_f = (F_FLOAT*) buffer;
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x;
my_f += _nmax;
@ -1350,7 +1350,7 @@ __global__ void Pair_GenerateXType_Kernel()
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_FLOAT4 xtype;
X_CFLOAT4 xtype;
xtype.x = _x[i];
xtype.y = _x[i + _nmax];
xtype.z = _x[i + 2 * _nmax];
@ -1365,7 +1365,7 @@ __global__ void Pair_GenerateVRadius_Kernel()
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
V_FLOAT4 vradius;
V_CFLOAT4 vradius;
vradius.x = _v[i];
vradius.y = _v[i + _nmax];
vradius.z = _v[i + 2 * _nmax];
@ -1379,7 +1379,7 @@ __global__ void Pair_GenerateOmegaRmass_Kernel()
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
V_FLOAT4 omegarmass;
V_CFLOAT4 omegarmass;
omegarmass.x = _omega[i];
omegarmass.y = _omega[i + _nmax];
omegarmass.z = _omega[i + 2 * _nmax];
@ -1393,7 +1393,7 @@ __global__ void Pair_RevertXType_Kernel()
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_FLOAT4 xtype = _x_type[i];
X_CFLOAT4 xtype = _x_type[i];
_x[i] = xtype.x;
_x[i + _nmax] = xtype.y;
_x[i + 2 * _nmax] = xtype.z;
@ -1407,7 +1407,7 @@ __global__ void Pair_BuildXHold_Kernel()
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_FLOAT4 xtype = _x_type[i];
X_CFLOAT4 xtype = _x_type[i];
_xhold[i] = xtype.x;
_xhold[i + _nmax] = xtype.y;
_xhold[i + 2 * _nmax] = xtype.z;
@ -1421,10 +1421,10 @@ __global__ void Pair_CollectForces_Kernel(int nperblock, int n)
if(i >= _nlocal) return;
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
F_FLOAT* my_f = _f + i;
F_CFLOAT* buf_f = (F_CFLOAT*) &buf[nperblock * n];
F_CFLOAT* my_f = _f + i;
buf_f += i;
*my_f += * buf_f;
my_f += _nmax;
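Pair_CollectForces_Kernel runs when _collect_forces_later is set: the pair kernels stage per-atom forces behind the nperblock·n energy/virial partials in _buffer, and this kernel adds them back into the global force array. Components sit _nmax apart (structure-of-arrays layout), so the add presumably continues for y and z with the same stride (a sketch; the buf_f stride is an assumption):
buf_f += _nmax;
*my_f += *buf_f;      /* y component */
my_f += _nmax;
buf_f += _nmax;
*my_f += *buf_f;      /* z component */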

View File

@ -21,12 +21,12 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
extern __shared__ ENERGY_CFLOAT sharedmem[];
static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
{
__syncthreads();
ENERGY_FLOAT* shared = sharedmem;
ENERGY_CFLOAT* shared = sharedmem;
if(eflag) {
reduceBlock(shared);
@ -49,7 +49,7 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i
if(threadIdx.x == 0) {
shared = sharedmem;
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
@ -79,8 +79,8 @@ static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, i
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
{
sharedmem[threadIdx.x] = ENERGY_F(0.0);
ENERGY_FLOAT sum = ENERGY_F(0.0);
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT sum = ENERGY_F(0.0);
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
//if(blockIdx.x==2) buf=&buf[n];
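This is the standard two-stage reduction: each block of a pair kernel folds its partial energies/virials into shared memory, thread 0 writes one ENERGY_CFLOAT per block into _buffer, and PairVirialCompute_reduce then sums the n per-block partials. A sketch of how the second stage presumably continues, assuming reduceBlock() sums sharedmem[0..blockDim.x) into sharedmem[0]:
ENERGY_CFLOAT sum = ENERGY_F(0.0);
for(int k = threadIdx.x; k < n; k += blockDim.x)
    sum += buf[k];               /* strided load of per-block partials */
sharedmem[threadIdx.x] = sum;
__syncthreads();
reduceBlock(sharedmem);          /* block-wide sum lands in sharedmem[0] */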

View File

@ -25,7 +25,7 @@
#define CUDA_PRECISION_H_
/* This file gives type definitions for mixed-precision calculation in the CUDA part of LAMMPS-CUDA.
* Default behaviour is set by the global CUDA_PRECISION (can be overridden at compile time).
* ***_FLOAT: type definition of given property
* ***_CFLOAT: type definition of given property
* ***_F: literal-suffix macro (1.0 is interpreted as double while 1.0f is interpreted as float; write CUDA_F(1.0) so the literal matches the selected precision)
*/
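To make the scheme concrete, a minimal usage sketch (illustrative only; every macro used here is defined below, nothing is new API):
/* With the default CUDA_PRECISION == 2 these resolve to double and plain
   literals; compiling with -DF_PRECISION=1 switches the force types to
   float and F_F(0.5) to 0.5f, without touching the position types. */
F_CFLOAT fpair = F_F(0.0);   /* accumulator in force precision */
X_CFLOAT delx  = X_F(0.5);   /* literal in position precision  */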
@ -39,17 +39,17 @@
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_CFLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
@ -59,17 +59,17 @@
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_CFLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_CFLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_CFLOAT CUDA_CFLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
@ -84,24 +84,24 @@
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#define PPPM_CFLOAT float
#ifdef float3
#define PPPM_FLOAT3 float3
#define PPPM_CFLOAT3 float3
#else
struct PPPM_FLOAT3 {
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
struct PPPM_CFLOAT3 {
PPPM_CFLOAT x;
PPPM_CFLOAT y;
PPPM_CFLOAT z;
};
#endif
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
struct PPPM_FLOAT3 {
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
#define PPPM_CFLOAT double
struct PPPM_CFLOAT3 {
PPPM_CFLOAT x;
PPPM_CFLOAT y;
PPPM_CFLOAT z;
};
#define PPPM_F(x) x
#endif
@ -115,17 +115,17 @@ struct PPPM_FLOAT3 {
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_CFLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_CFLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_CFLOAT CUDA_CFLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
@ -141,48 +141,49 @@ struct PPPM_FLOAT3 {
#endif
#if F_PRECISION == 2
struct F_FLOAT2 {
F_FLOAT x;
F_FLOAT y;
struct F_CFLOAT2 {
F_CFLOAT x;
F_CFLOAT y;
};
struct F_FLOAT3 {
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
struct F_CFLOAT3 {
F_CFLOAT x;
F_CFLOAT y;
F_CFLOAT z;
};
struct F_FLOAT4 {
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
struct F_CFLOAT4 {
F_CFLOAT x;
F_CFLOAT y;
F_CFLOAT z;
F_CFLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#define F_CFLOAT2 float2
#define F_CFLOAT3 float3
#define F_CFLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_CFLOAT CUDA_CFLOAT
#define ENERGY_F(x) CUDA_F(x)
#endif
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_CFLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_CFLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_CFLOAT CUDA_CFLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
@ -193,41 +194,41 @@ struct F_FLOAT4 {
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_CFLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_CFLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_CFLOAT CUDA_CFLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2 {
X_FLOAT x;
X_FLOAT y;
struct X_CFLOAT2 {
X_CFLOAT x;
X_CFLOAT y;
};
struct X_FLOAT3 {
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
struct X_CFLOAT3 {
X_CFLOAT x;
X_CFLOAT y;
X_CFLOAT z;
};
struct X_FLOAT4 {
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
struct X_CFLOAT4 {
X_CFLOAT x;
X_CFLOAT y;
X_CFLOAT z;
X_CFLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#define X_CFLOAT2 float2
#define X_CFLOAT3 float3
#define X_CFLOAT4 float4
#endif
//--------------------------------
@ -236,30 +237,30 @@ struct X_FLOAT4 {
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_CFLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_CFLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_CFLOAT CUDA_CFLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4 {
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
struct V_CFLOAT4 {
V_CFLOAT x;
V_CFLOAT y;
V_CFLOAT z;
V_CFLOAT w;
};
#else
#define V_FLOAT4 float4
#define V_CFLOAT4 float4
#endif
#ifdef NO_PREC_TIMING

View File

@ -61,9 +61,9 @@ struct cuda_shared_atom { // relevant data from atom class
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velocity + radius in V_FLOAT4 struct; currently only used for granular atom_style
dev_array omega_rmass; // omega (angular velocity) + rmass in V_FLOAT4 struct; currently only used for granular atom_style
dev_array x_type; // position + type in X_CFLOAT4 struct
dev_array v_radius; // velocity + radius in V_CFLOAT4 struct; currently only used for granular atom_style
dev_array omega_rmass; // omega (angular velocity) + rmass in V_CFLOAT4 struct; currently only used for granular atom_style
double* mass_host; // remember per-type host pointer to masses
//int natoms; // total # of atoms in system, could be 0
@ -82,7 +82,7 @@ struct cuda_shared_atom { // relevant data from atom class
int update_neigh;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum squared displacement before reneighboring
X_CFLOAT triggerneighsq; // maximum squared displacement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
@ -96,9 +96,9 @@ struct cuda_shared_atom { // relevant data from atom class
struct cuda_shared_pair { // relevant data from pair class
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
X_CFLOAT cut_global;
X_CFLOAT cut_inner_global;
X_CFLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff, squared
double** cut_inner; // type-type cutoff for coul
@ -116,11 +116,11 @@ struct cuda_shared_pair { // relevant data from pair class
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald, kappa;
dev_array virial; // ENERGY_CFLOAT
dev_array eng_vdwl; // ENERGY_CFLOAT
dev_array eng_coul; // ENERGY_CFLOAT
X_CFLOAT cut_coulsq_global;
F_CFLOAT g_ewald, kappa;
int freeze_group_bit;
dev_array coeff1_gm;
@ -144,48 +144,48 @@ struct cuda_shared_pair { // relevant data from pair class
};
struct cuda_shared_domain { // relevant data from domain class
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_CFLOAT subhi[3];
X_CFLOAT boxlo[3];
X_CFLOAT boxhi[3];
X_CFLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
X_CFLOAT xy;
X_CFLOAT xz;
X_CFLOAT yz;
X_CFLOAT boxlo_lamda[3];
X_CFLOAT boxhi_lamda[3];
X_CFLOAT prd_lamda[3];
X_CFLOAT h[6];
X_CFLOAT h_inv[6];
V_CFLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm {
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
FFT_CFLOAT* work1;
FFT_CFLOAT* work2;
FFT_CFLOAT* work3;
PPPM_CFLOAT* greensfn;
PPPM_CFLOAT* fkx;
PPPM_CFLOAT* fky;
PPPM_CFLOAT* fkz;
PPPM_CFLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
PPPM_CFLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
PPPM_CFLOAT density_intScale;
PPPM_CFLOAT* vdx_brick;
PPPM_CFLOAT* vdy_brick;
PPPM_CFLOAT* vdz_brick;
PPPM_CFLOAT* density_fft;
ENERGY_CFLOAT* energy;
ENERGY_CFLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
@ -201,20 +201,20 @@ struct cuda_shared_pppm {
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
PPPM_CFLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
PPPM_CFLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
PPPM_CFLOAT* debugdata;
PPPM_CFLOAT delxinv;
PPPM_CFLOAT delyinv;
PPPM_CFLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
PPPM_FLOAT3* fH;
PPPM_CFLOAT shiftone;
PPPM_CFLOAT3* fH;
};
struct cuda_shared_comm {
@ -262,7 +262,7 @@ struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cu
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
CUDA_CFLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;

View File

@ -49,8 +49,8 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
@ -58,19 +58,19 @@ void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int));
cudaMemcpyToSymbol(MY_AP(triclinic) , & sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_FLOAT));
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*));
}
@ -94,15 +94,15 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
int sharedmem = 0;
if(box_change) sharedmem = 6 * sizeof(X_FLOAT);
if(box_change) sharedmem = 6 * sizeof(X_CFLOAT);
int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
sharedmem *= threads.x;
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_FLOAT)*grid.x * grid.y > sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_FLOAT));
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT));
Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
@ -111,13 +111,13 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
if(box_change) {
X_FLOAT buf2[6 * layout.x * layout.y];
X_FLOAT* buf = buf2;
X_CFLOAT buf2[6 * layout.x * layout.y];
X_CFLOAT* buf = buf2;
int flag;
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
//printf("Flag: %i\n",flag);
X_FLOAT min, max;
X_CFLOAT min, max;
min = 1.0 * BIG;
max = -1.0 * BIG;
@ -160,7 +160,7 @@ void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_group
if(n<128) threads.x=32;
else if(n<256) threads.x=64;
else threads.x=128;
sharedmem=n*sizeof(X_FLOAT);
sharedmem=n*sizeof(X_CFLOAT);
grid.x=6;
grid.y=1;
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);

View File

@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ X_FLOAT sharedmem[];
extern __shared__ X_CFLOAT sharedmem[];
#define BIG 1e10
__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
@ -29,9 +29,9 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
int idim, otherdims;
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT lo[3];
X_FLOAT hi[3];
X_FLOAT* period;
X_CFLOAT lo[3];
X_CFLOAT hi[3];
X_CFLOAT* period;
if(_triclinic == 0) {
lo[0] = _boxlo[0];
@ -54,11 +54,11 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
}
X_FLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
X_FLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
X_FLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);
X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);
X_FLOAT* buf = (X_FLOAT*) _buffer;
X_CFLOAT* buf = (X_CFLOAT*) _buffer;
buf += blockIdx.x * gridDim.y + blockIdx.y;
buf[0] = tmpx;
buf += gridDim.x * gridDim.y;
@ -181,12 +181,12 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
__syncthreads();
if(box_change) {
X_FLOAT minx = BIG;
X_FLOAT maxx = -BIG;
X_FLOAT miny = BIG;
X_FLOAT maxy = -BIG;
X_FLOAT minz = BIG;
X_FLOAT maxz = -BIG;
X_CFLOAT minx = BIG;
X_CFLOAT maxx = -BIG;
X_CFLOAT miny = BIG;
X_CFLOAT maxy = -BIG;
X_CFLOAT minz = BIG;
X_CFLOAT maxz = -BIG;
if(not _periodicity[0]) {
sharedmem[threadIdx.x] = tmpx;
@ -231,7 +231,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
}
if(threadIdx.x == 0) {
buf = (X_FLOAT*) _buffer;
buf = (X_CFLOAT*) _buffer;
buf += blockIdx.x * gridDim.y + blockIdx.y;
buf[0] = minx;
buf += gridDim.x * gridDim.y;
@ -250,7 +250,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box
__global__ void Domain_reduceBoxExtent(double* extent, int n)
{
X_FLOAT* buf = (X_FLOAT*) _buffer;
X_CFLOAT* buf = (X_CFLOAT*) _buffer;
buf += blockIdx.x * n;
copyGlobToShared(buf, sharedmem, n);
@ -267,8 +267,8 @@ __global__ void Domain_lamda2x_Kernel(int n)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < n) {
X_FLOAT ytmp = _x[i + _nmax];
X_FLOAT ztmp = _x[i + 2 * _nmax];
X_CFLOAT ytmp = _x[i + _nmax];
X_CFLOAT ztmp = _x[i + 2 * _nmax];
_x[i] = _h[0] * _x[i] + _h[5] * ytmp + _h[4] * ztmp + _boxlo[0];
_x[i + _nmax] = _h[1] * ytmp + _h[3] * ztmp + _boxlo[1];
_x[i + 2 * _nmax] = _h[2] * ztmp + _boxlo[2];
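Written out, Domain_lamda2x_Kernel applies x = h·lamda + boxlo, with the triclinic box matrix h stored in LAMMPS' 6-element order (h[0]=xx, h[1]=yy, h[2]=zz, h[3]=yz, h[4]=xz, h[5]=xy), as the index pattern above shows:

x = h[0]·lamda_x + h[5]·lamda_y + h[4]·lamda_z + boxlo[0]
y =                h[1]·lamda_y + h[3]·lamda_z + boxlo[1]
z =                               h[2]·lamda_z + boxlo[2]

Domain_x2lamda_Kernel below is the inverse map, presumably using the precomputed h_inv copied to the device in Cuda_Domain_UpdateDomain.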
@ -279,7 +279,7 @@ __global__ void Domain_x2lamda_Kernel(int n)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT delta[3];
X_CFLOAT delta[3];
if(i < n) {
delta[0] = _x[i] - _boxlo[0];

View File

@ -25,15 +25,15 @@
#include "cuda_precision.h"
#include "cuda_common.h"
struct FFT_DATA {
FFT_FLOAT re;
FFT_FLOAT im;
FFT_CFLOAT re;
FFT_CFLOAT im;
};
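A note on layout (an inference, not stated in the patch): FFT_DATA is two contiguous FFT_CFLOATs, so in single precision it should be layout-compatible with cuFFT's cufftComplex; this is why the wrappers below freely cast FFT_DATA* to FFT_CFLOAT* and index real/imaginary parts as interleaved scalars:
/* Sketch of the interleaved view assumed by the kernels below. */
FFT_DATA z;
FFT_CFLOAT* s = (FFT_CFLOAT*) &z;   /* s[0] == z.re, s[1] == z.im */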
#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>
void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow)
{
dim3 grid;
@ -62,7 +62,7 @@ void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
threads.x = nfast * 2;
threads.y = 1;
threads.z = 1;
permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
@ -78,7 +78,7 @@ void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
threads.x = nfast * 2;
threads.y = 1;
threads.z = 1;
permute_scale_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
permute_scale_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
cudaThreadSynchronize();
}
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
@ -92,7 +92,7 @@ void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, i
threads.x = (khi - klo + 1) * 2;
threads.y = 1;
threads.z = 1;
permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
cudaThreadSynchronize();
}

View File

@ -23,7 +23,7 @@
#include "cuda_shared.h"
extern "C" void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow);
extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow);
extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);

View File

@ -21,24 +21,24 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void initfftdata_kernel(double* in, FFT_FLOAT* out)
__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out)
{
out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0;
}
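initfftdata_kernel widens a real double array into interleaved complex storage: each input value becomes the real slot of one complex element and the imaginary slot is zeroed. A host-side reference of the same transform (a sketch; n stands for the total grid size nfast·nmid·nslow implied by the launch configuration):
/* CPU reference for initfftdata_kernel, illustrative only. */
for(int i = 0; i < n; ++i) {
    out[2 * i]     = (FFT_CFLOAT) in[i];   /* real part, narrowed if float */
    out[2 * i + 1] = 0;                    /* imaginary part */
}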
__global__ void permute_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
}
__global__ void permute_scale_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5;
}
__global__ void permute_part_kernel(FFT_FLOAT* in, FFT_FLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
{
out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo];

View File

@ -33,10 +33,10 @@
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -55,8 +55,8 @@ void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
@ -64,7 +64,7 @@ void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal)
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAddForceCuda_UpdateNmax(sdata);
@ -75,18 +75,18 @@ void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO
if(sdata->buffer_new)
Cuda_FixAddForceCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");
int oldgrid = grid.x;
grid.x = 4;
threads.x = 512;
reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");

View File

@ -24,4 +24,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal);
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal);

View File

@ -21,10 +21,10 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
extern __shared__ F_CFLOAT sharedmem[];
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
@ -51,7 +51,7 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
reduceBlock(&sharedmem[3 * blockDim.x]);
F_FLOAT* buffer = (F_FLOAT*) _buffer;
F_CFLOAT* buffer = (F_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@ -63,12 +63,12 @@ __global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
}
__global__ void reduce_foriginal(int n, F_FLOAT* foriginal)
__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_FLOAT myforig = 0.0;
F_FLOAT* buf = (F_FLOAT*) _buffer;
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {

View File

@ -33,10 +33,10 @@
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -55,8 +55,8 @@ void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
@ -64,7 +64,7 @@ void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal)
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAveForceCuda_UpdateNmax(sdata);
@ -75,25 +75,25 @@ void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,
if(sdata->buffer_new)
Cuda_FixAveForceCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit);
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");
int oldgrid = grid.x;
grid.x = 4;
threads.x = 512;
Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue)
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue)
{
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);

View File

@ -24,5 +24,5 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal);
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue);
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal);
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue);

View File

@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
extern __shared__ F_CFLOAT sharedmem[];
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
@ -44,7 +44,7 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
reduceBlock(&sharedmem[3 * blockDim.x]);
F_FLOAT* buffer = (F_FLOAT*) _buffer;
F_CFLOAT* buffer = (F_CFLOAT*) _buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@ -55,12 +55,12 @@ __global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
}
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal)
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_FLOAT myforig = 0.0;
F_FLOAT* buf = (F_FLOAT*) _buffer;
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {
@ -81,7 +81,7 @@ __global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal)
foriginal[blockIdx.x] = myforig;
}
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

View File

@ -34,8 +34,8 @@ void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)

View File

@ -32,10 +32,10 @@
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -55,9 +55,9 @@ void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
}
@ -68,7 +68,7 @@ void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
}
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal)
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal)
{
if(sdata->atom.update_nmax)
Cuda_FixFreezeCuda_UpdateNmax(sdata);
@ -80,18 +80,18 @@ void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT
Cuda_FixFreezeCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit);
Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid = grid.x;
grid.x = 3;
threads.x = 512;
Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal);
Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed");

View File

@ -24,4 +24,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal);

View File

@ -21,7 +21,7 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
extern __shared__ F_CFLOAT sharedmem[];
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
@ -49,7 +49,7 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_FLOAT* buffer = (F_FLOAT*)_buffer;
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@ -59,12 +59,12 @@ __global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
}
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_FLOAT myforig = 0.0;
F_FLOAT* buf = (F_FLOAT*)_buffer;
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {

View File

@ -32,10 +32,10 @@
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -55,12 +55,12 @@ void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
}
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
@ -70,7 +70,7 @@ void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
}
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
if(sdata->atom.update_nmax)
Cuda_FixGravityCuda_UpdateNmax(sdata);

View File

@ -24,4 +24,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc);

View File

@ -21,13 +21,13 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal)
if(_mask[i] & groupbit) {
F_FLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]];
F_CFLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]];
_f[i] += mass * xacc;
_f[i + 1 * _nmax] += mass * yacc;
_f[i + 2 * _nmax] += mass * zacc;

View File

@ -32,21 +32,21 @@
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
@ -67,12 +67,12 @@ void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNHCuda_UpdateNmax(sdata);
@ -97,8 +97,8 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_FLOAT3 factor2;
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
@ -125,8 +125,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_FLOAT3 factor2;
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
@ -143,7 +143,7 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro
}
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
@ -237,8 +237,8 @@ void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int gro
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_FLOAT3 factor2;
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];

View File

@ -23,9 +23,9 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp

View File

@ -21,14 +21,14 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
X_FLOAT d = X_F(0.0);
X_CFLOAT d = X_F(0.0);
if(i < _nlocal) {
X_FLOAT tmp = xtmp - _xhold[i];
X_CFLOAT tmp = xtmp - _xhold[i];
d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
@ -43,15 +43,15 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO
}
}
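check_distance accumulates each atom's squared displacement since the last neighbor-list build (_xhold keeps the old positions, components stored _maxhold apart) and compares it against _triggerneighsq. Once exceeded, it presumably raises the reneigh_flag that Cuda_FixNHCuda_UpdateNmax wires to sdata->buffer; a simplified sketch of that intent (the actual kernel may first reduce the flag across the block):
if(d > _triggerneighsq)
    _reneigh_flag[0] = 1;   /* assumed: request reneighboring on the host side */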
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
V_FLOAT* my_v = _v + i;
V_FLOAT vx = my_v[0];
V_FLOAT vy = my_v[_nmax];
V_FLOAT vz = my_v[2 * _nmax];
V_CFLOAT* my_v = _v + i;
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
@ -71,12 +71,12 @@ __global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p
}
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
V_FLOAT* my_v = _v + i;
V_CFLOAT* my_v = _v + i;
my_v[0] *= factor_eta;
my_v[_nmax] *= factor_eta;
my_v[2 * _nmax] *= factor_eta;
@ -84,22 +84,22 @@ __global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
}
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_FLOAT vx = my_v[0];
V_FLOAT vy = my_v[_nmax];
V_FLOAT vz = my_v[2 * _nmax];
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
@ -125,10 +125,10 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
@ -145,13 +145,13 @@ __global__ void FixNHCuda_nve_v_Kernel(int groupbit)
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
X_FLOAT xtmp, ytmp, ztmp;
X_CFLOAT xtmp, ytmp, ztmp;
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
V_CFLOAT* my_v = _v + i;
X_CFLOAT* my_x = _x + i;
xtmp = *my_x += _dtv * *my_v;
my_v += _nmax;
@ -166,23 +166,23 @@ __global__ void FixNHCuda_nve_x_Kernel(int groupbit)
}
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_FLOAT vx = my_v[0] + dtfm * my_f[0];
V_FLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
V_FLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
V_CFLOAT vx = my_v[0] + dtfm * my_f[0];
V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
vx *= factor.x;
vy *= factor.y;

View File

@ -32,19 +32,19 @@
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
@ -65,12 +65,12 @@ void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNVECuda_UpdateNmax(sdata);

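The host-side Init and UpdateNmax routines publish scalars and device pointers into __constant__ memory with cudaMemcpyToSymbol; MY_AP presumably expands to a per-package prefix on the symbol name (its definition is not part of this diff). A reduced sketch of the same pattern in a standalone .cu file, with illustrative symbol names:

__device__ __constant__ int    d_nlocal;  // mirrored host scalar
__device__ __constant__ float* d_x;       // device pointer to positions

void update_symbols(int nlocal, float* x_dev)
{
    // Copy the scalar value (and the pointer value) into constant memory;
    // kernels can then read d_nlocal and d_x without extra launch arguments.
    cudaMemcpyToSymbol(d_nlocal, &nlocal, sizeof(int));
    cudaMemcpyToSymbol(d_x,      &x_dev,  sizeof(float*));
}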
View File

@ -23,6 +23,6 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);

View File

@ -21,11 +21,11 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
X_FLOAT tmp = xtmp - _xhold[i];
X_FLOAT d = tmp * tmp;
X_CFLOAT tmp = xtmp - _xhold[i];
X_CFLOAT d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
tmp = ztmp - _xhold[i + 2 * _maxhold];
@ -41,7 +41,7 @@ static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLO
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
X_FLOAT xtmp, ytmp, ztmp;
X_CFLOAT xtmp, ytmp, ztmp;
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
@ -50,16 +50,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
X_FLOAT* my_x = _binned_x + i;
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
X_CFLOAT* my_x = _binned_x + i;
V_FLOAT dtfm = _dtf
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
V_FLOAT v_mem;
V_CFLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += blockDim.x;
@ -80,16 +80,16 @@ __global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
X_CFLOAT* my_x = _x + i;
V_FLOAT dtfm = _dtf;
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_FLOAT v_mem;
V_CFLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += _nmax;
@ -119,10 +119,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
V_FLOAT dtfm = _dtf
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
@ -142,10 +142,10 @@ __global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];

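Throughout these kernels, X_CFLOAT, V_CFLOAT, F_CFLOAT, and ENERGY_CFLOAT select the compile-time precision of positions, velocities, forces, and accumulators, and the X_F()/V_F()/ENERGY_F() wrappers keep literals in the matching precision. Their definitions live in the package headers rather than in this diff, so the following is only a plausible reconstruction of the scheme:

#ifdef CUDA_PRECISION_DOUBLE          // hypothetical configuration switch
typedef double X_CFLOAT;              // positions
typedef double V_CFLOAT;              // velocities
#define X_F(x) x                      // double literals pass through
#define V_F(x) x
#else
typedef float  X_CFLOAT;
typedef float  V_CFLOAT;
#define X_F(x) x##f                   // force single-precision literals
#define V_F(x) x##f
#endif

Wrapping literals this way avoids accidental double-precision promotion of expressions like V_F(1.0) / _rmass[i] in single-precision builds.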
View File

@ -32,10 +32,10 @@
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -55,8 +55,8 @@ void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
@ -66,7 +66,7 @@ void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
}
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz)
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz)
{
if(sdata->atom.update_nmax)
Cuda_FixSetForceCuda_UpdateNmax(sdata);
@ -78,18 +78,18 @@ void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLO
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid = grid.x;
grid.x = 3;
threads.x = 512;
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal);
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");

View File

@ -24,4 +24,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz);

View File

@ -21,10 +21,10 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
extern __shared__ F_CFLOAT sharedmem[];
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, int flagx, int flagy, int flagz)
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
sharedmem[threadIdx.x] = 0;
@ -48,7 +48,7 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_FLOAT* buffer = (F_FLOAT*)_buffer;
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
@ -58,12 +58,12 @@ __global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xval
}
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_FLOAT myforig = 0.0;
F_FLOAT* buf = (F_FLOAT*)_buffer;
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {

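The PostForce kernel above accumulates three per-block partial sums (one per force component) in dynamic shared memory and writes them to _buffer; Cuda_FixSetForceCuda_Reduce_FOriginal then collapses the per-block values with a grid-stride loop. A self-contained sketch of that two-stage reduction, assuming a power-of-two block size:

__global__ void partial_sums(const float* in, float* partial, int n)
{
    extern __shared__ float smem[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    smem[threadIdx.x] = (i < n) ? in[i] : 0.0f;
    __syncthreads();
    for(int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction
        if(threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }
    if(threadIdx.x == 0) partial[blockIdx.x] = smem[0];  // one value per block
}

__global__ void final_sum(const float* partial, float* out, int nblocks)
{
    extern __shared__ float smem[];
    float acc = 0.0f;
    for(int i = threadIdx.x; i < nblocks; i += blockDim.x)  // stride over blocks
        acc += partial[i];
    smem[threadIdx.x] = acc;
    __syncthreads();
    for(int s = blockDim.x / 2; s > 0; s >>= 1) {
        if(threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }
    if(threadIdx.x == 0) *out = smem[0];
}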
View File

@ -41,37 +41,37 @@
__device__ __constant__ int* _shake_atom;
__device__ __constant__ int* _shake_type;
__device__ __constant__ int* _shake_flag;
__device__ __constant__ X_FLOAT3* _xshake;
__device__ __constant__ F_FLOAT _dtfsq;
__device__ __constant__ X_FLOAT* _bond_distance;
__device__ __constant__ X_FLOAT* _angle_distance;
__device__ __constant__ X_CFLOAT3* _xshake;
__device__ __constant__ F_CFLOAT _dtfsq;
__device__ __constant__ X_CFLOAT* _bond_distance;
__device__ __constant__ X_CFLOAT* _angle_distance;
__device__ __constant__ int _max_iter;
__device__ __constant__ X_FLOAT _tolerance;
__device__ __constant__ X_CFLOAT _tolerance;
#include "fix_shake_cuda_kernel.cu"
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*));
}
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_FLOAT) * 6);
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6);
}
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
@ -89,10 +89,10 @@ void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_FLOAT tolerance)
int max_iter, X_CFLOAT tolerance)
{
Cuda_FixShakeCuda_UpdateNmax(sdata);
Cuda_FixShakeCuda_UpdateDomain(sdata);
@ -100,17 +100,17 @@ void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int));
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT));
if(sdata->atom.mass_host)
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
@ -149,16 +149,16 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom,
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_FLOAT), 64);
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_FLOAT));
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT));
BindXTypeTexture(sdata);
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, list, nlist);
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, nlist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
@ -168,7 +168,7 @@ void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom,
grid.x = 6;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
}
@ -183,14 +183,14 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@ -212,7 +212,7 @@ int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
cudaMemset(sdata->flag, 0, sizeof(int));
FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
cudaThreadSynchronize();
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
@ -232,16 +232,16 @@ int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, i
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
static int count = -1;
count++;
X_FLOAT dx = 0.0;
X_FLOAT dy = 0.0;
X_FLOAT dz = 0.0;
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
@ -278,7 +278,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_FLOAT);
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
@ -288,7 +288,7 @@ void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, voi
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");

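PackComm/UnpackComm above stage xshake through a buffer laid out component-major: slots i, i + n, and i + 2*n hold the x, y, and z of the i-th communicated atom, so each kernel writes three coalesced planes. A sketch of a matching pack/unpack pair, with illustrative names:

__global__ void pack_xyz(const float3* src, const int* sendlist,
                         float* buf, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) {
        float3 p = src[sendlist[i]];   // gather by send list
        buf[i]         = p.x;
        buf[i + n]     = p.y;
        buf[i + 2 * n] = p.z;
    }
}

__global__ void unpack_xyz(float3* dst, const float* buf, int n, int first)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)                          // scatter into the ghost range
        dst[first + i] = make_float3(buf[i], buf[i + n], buf[i + 2 * n]);
}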
View File

@ -22,10 +22,10 @@
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_FLOAT tolerance);
int max_iter, X_CFLOAT tolerance);
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);

View File

@ -21,12 +21,12 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_FLOAT total, ENERGY_FLOAT* v)
__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_CFLOAT total, ENERGY_CFLOAT* v)
{
/*if(vflag_global)
{
ENERGY_FLOAT fraction = n/total;
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
ENERGY_CFLOAT fraction = n/total;
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
*shared += fraction*v[0]; shared+=blockDim.x;
*shared += fraction*v[1]; shared+=blockDim.x;
*shared += fraction*v[2]; shared+=blockDim.x;
@ -35,11 +35,11 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E
*shared += fraction*v[5];
}*/
if(vflag_atom) {
ENERGY_FLOAT fraction = ENERGY_F(1.0) / total;
ENERGY_CFLOAT fraction = ENERGY_F(1.0) / total;
for(int i = 0; i < n; i++) {
int m = list[i];
ENERGY_FLOAT* myvatom = &_vatom[m];
ENERGY_CFLOAT* myvatom = &_vatom[m];
*myvatom += fraction * v[0];
myvatom += _nmax;
@ -56,7 +56,7 @@ __device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, E
}
}
inline __device__ void minimum_image(X_FLOAT3 &delta)
inline __device__ void minimum_image(X_CFLOAT3 &delta)
{
if(_triclinic == 0) {
if(_periodicity[0]) {
@ -106,14 +106,14 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
if(i >= _nlocal) return;
X_FLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)};
X_CFLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)};
if(_shake_flag[i]) {
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
X_CFLOAT* my_x = _x + i;
V_FLOAT dtfmsq = _dtfsq;
V_CFLOAT dtfmsq = _dtfsq;
if(_rmass_flag) dtfmsq *= V_F(1.0) / _rmass[i];
else dtfmsq *= V_F(1.0) / _mass[_type[i]];
@ -138,20 +138,20 @@ __global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
__device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
{
int nlist, list[2];
ENERGY_FLOAT v[6];
X_FLOAT invmass0, invmass1;
ENERGY_CFLOAT v[6];
X_CFLOAT invmass0, invmass1;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m + _nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01;
X_CFLOAT3 r01;
X_FLOAT4 x_i0, x_i1;
X_CFLOAT4 x_i0, x_i1;
x_i0 = fetchXType(i0);
x_i1 = fetchXType(i1);
@ -162,9 +162,9 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01;
X_FLOAT3 xs_i0 = _xshake[i0];
X_FLOAT3 xs_i1 = _xshake[i1];
X_CFLOAT3 s01;
X_CFLOAT3 xs_i0 = _xshake[i0];
X_CFLOAT3 xs_i1 = _xshake[i1];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
@ -173,8 +173,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
// scalar distances between atoms
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
// a,b,c = coeffs in quadratic equation for lamda
@ -186,14 +186,14 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
}
X_FLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_FLOAT b = X_F(2.0) * (invmass0 + invmass1) *
X_CFLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_CFLOAT b = X_F(2.0) * (invmass0 + invmass1) *
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
X_FLOAT c = s01sq - bond1 * bond1;
X_CFLOAT c = s01sq - bond1 * bond1;
// error check
X_FLOAT determ = b * b - X_F(4.0) * a * c;
X_CFLOAT determ = b * b - X_F(4.0) * a * c;
if(determ < X_F(0.0)) {
_flag[0]++;
@ -202,7 +202,7 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
// exact quadratic solution for lamda
X_FLOAT lamda, lamda1, lamda2;
X_CFLOAT lamda, lamda1, lamda2;
lamda1 = -b + _SQRT_(determ);
lamda2 = -lamda1 - X_F(2.0) * b;
lamda1 *= X_F(1.0) / (X_F(2.0) * a);
@ -233,8 +233,8 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
}
if(vflag || vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor = nlist;
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
X_CFLOAT factor = nlist;
v[0] = lamda * r01.x * r01.x;
*shared = factor * v[0];
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
@ -262,22 +262,22 @@ __device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
__device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
{
int nlist, list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0, invmass1, invmass2;
ENERGY_CFLOAT v[6];
X_CFLOAT invmass0, invmass1, invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m + _nmax]];
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01, r02;
X_CFLOAT3 r01, r02;
X_FLOAT4 x_i0, x_i1, x_i2;
X_CFLOAT4 x_i0, x_i1, x_i2;
x_i0 = fetchXType(i0);
x_i1 = fetchXType(i1);
x_i2 = fetchXType(i2);
@ -294,10 +294,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01, s02;
X_FLOAT3 xs_i0 = _xshake[i0];
X_FLOAT3 xs_i1 = _xshake[i1];
X_FLOAT3 xs_i2 = _xshake[i2];
X_CFLOAT3 s01, s02;
X_CFLOAT3 xs_i0 = _xshake[i0];
X_CFLOAT3 xs_i1 = _xshake[i1];
X_CFLOAT3 xs_i2 = _xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
@ -311,10 +311,10 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
// scalar distances between atoms
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
// a,b,c = coeffs in quadratic equation for lamda
@ -328,48 +328,48 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
X_CFLOAT a12 = X_F(2.0) * invmass0 *
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
X_CFLOAT a21 = X_F(2.0) * invmass0 *
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
// error check
X_FLOAT determ = a11 * a22 - a12 * a21;
X_CFLOAT determ = a11 * a22 - a12 * a21;
if(determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0) / determ;
X_CFLOAT determinv = X_F(1.0) / determ;
X_FLOAT a11inv = a22 * determinv;
X_FLOAT a12inv = -a12 * determinv;
X_FLOAT a21inv = -a21 * determinv;
X_FLOAT a22inv = a11 * determinv;
X_CFLOAT a11inv = a22 * determinv;
X_CFLOAT a12inv = -a12 * determinv;
X_CFLOAT a21inv = -a21 * determinv;
X_CFLOAT a22inv = a11 * determinv;
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_CFLOAT lamda01 = X_F(0.0);
X_CFLOAT lamda02 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
X_CFLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
//maybe all running full loop?
while(__any(!done) && niter < _max_iter) {
@ -425,8 +425,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
}
if(vflag || vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x;
*shared = factor * v[0];
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
@ -453,8 +453,8 @@ __device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
__device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
{
int nlist, list[4];
ENERGY_FLOAT v[6];
X_FLOAT invmass0, invmass1, invmass2, invmass3;
ENERGY_CFLOAT v[6];
X_CFLOAT invmass0, invmass1, invmass2, invmass3;
// local atom IDs and constraint distances
@ -462,15 +462,15 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
int i1 = _map_array[_shake_atom[m + _nmax]];
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
int i3 = _map_array[_shake_atom[m + 3 * _nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
X_FLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]];
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
X_CFLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01, r02, r03;
X_CFLOAT3 r01, r02, r03;
X_FLOAT4 x_i0, x_i1, x_i2, x_i3;
X_CFLOAT4 x_i0, x_i1, x_i2, x_i3;
x_i0 = fetchXType(i0);
x_i1 = fetchXType(i1);
x_i2 = fetchXType(i2);
@ -493,11 +493,11 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01, s02, s03;
X_FLOAT3 xs_i0 = _xshake[i0];
X_FLOAT3 xs_i1 = _xshake[i1];
X_FLOAT3 xs_i2 = _xshake[i2];
X_FLOAT3 xs_i3 = _xshake[i3];
X_CFLOAT3 s01, s02, s03;
X_CFLOAT3 xs_i0 = _xshake[i0];
X_CFLOAT3 xs_i1 = _xshake[i1];
X_CFLOAT3 xs_i2 = _xshake[i2];
X_CFLOAT3 xs_i3 = _xshake[i3];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
@ -516,12 +516,12 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
// scalar distances between atoms
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_FLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z;
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
X_FLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z;
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_CFLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z;
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
X_CFLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z;
// a,b,c = coeffs in quadratic equation for lamda
@ -537,79 +537,79 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
invmass3 = X_F(1.0) / _mass[static_cast <int>(x_i3.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
X_CFLOAT a12 = X_F(2.0) * invmass0 *
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
X_FLOAT a13 = X_F(2.0) * invmass0 *
X_CFLOAT a13 = X_F(2.0) * invmass0 *
(s01.x * r03.x + s01.y * r03.y + s01.z * r03.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
X_CFLOAT a21 = X_F(2.0) * invmass0 *
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
X_FLOAT a23 = X_F(2.0) * (invmass0) *
X_CFLOAT a23 = X_F(2.0) * (invmass0) *
(s02.x * r03.x + s02.y * r03.y + s02.z * r03.z);
X_FLOAT a31 = X_F(2.0) * (invmass0) *
X_CFLOAT a31 = X_F(2.0) * (invmass0) *
(s03.x * r01.x + s03.y * r01.y + s03.z * r01.z);
X_FLOAT a32 = X_F(2.0) * (invmass0) *
X_CFLOAT a32 = X_F(2.0) * (invmass0) *
(s03.x * r02.x + s03.y * r02.y + s03.z * r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass0 + invmass3) *
X_CFLOAT a33 = X_F(2.0) * (invmass0 + invmass3) *
(s03.x * r03.x + s03.y * r03.y + s03.z * r03.z);
// error check
X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
if(determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0) / determ;
X_CFLOAT determinv = X_F(1.0) / determ;
X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_FLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
X_FLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_CFLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
X_CFLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_FLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_FLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
X_FLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_CFLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_CFLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
X_CFLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_FLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
X_FLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
X_FLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_CFLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
X_CFLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
X_CFLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
X_FLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
X_FLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
X_FLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
X_FLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
X_FLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
X_FLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
X_CFLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
X_CFLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
X_CFLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
X_CFLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
X_CFLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
X_CFLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda03 = X_F(0.0);
X_CFLOAT lamda01 = X_F(0.0);
X_CFLOAT lamda02 = X_F(0.0);
X_CFLOAT lamda03 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
//maybe all running full loop?
while(__any(!done) && niter < _max_iter) {
@ -692,8 +692,8 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
}
if(vflag || vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor = X_F(2.0) / X_F(4.0) * nlist;
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
X_CFLOAT factor = X_F(2.0) / X_F(4.0) * nlist;
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda03 * r03.x * r03.x;
*shared = factor * v[0];
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
@ -720,23 +720,23 @@ __device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
__device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
{
int nlist, list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0, invmass1, invmass2;
ENERGY_CFLOAT v[6];
X_CFLOAT invmass0, invmass1, invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m + _nmax]];
int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
X_FLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]];
X_CFLOAT bond1 = _bond_distance[_shake_type[m]];
X_CFLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
X_CFLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01, r02, r12;
X_CFLOAT3 r01, r02, r12;
X_FLOAT4 x_i0, x_i1, x_i2;
X_CFLOAT4 x_i0, x_i1, x_i2;
x_i0 = fetchXType(i0);
x_i1 = fetchXType(i1);
x_i2 = fetchXType(i2);
@ -758,10 +758,10 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01, s02, s12;
X_FLOAT3 xs_i0 = _xshake[i0];
X_FLOAT3 xs_i1 = _xshake[i1];
X_FLOAT3 xs_i2 = _xshake[i2];
X_CFLOAT3 s01, s02, s12;
X_CFLOAT3 xs_i0 = _xshake[i0];
X_CFLOAT3 xs_i1 = _xshake[i1];
X_CFLOAT3 xs_i2 = _xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
@ -780,12 +780,12 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
// scalar distances between atoms
X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_FLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z;
X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
X_FLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z;
X_CFLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
X_CFLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
X_CFLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z;
X_CFLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
X_CFLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
X_CFLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z;
// a,b,c = coeffs in quadratic equation for lamda
@ -799,79 +799,79 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
X_CFLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
(s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
X_CFLOAT a12 = X_F(2.0) * invmass0 *
(s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
X_FLOAT a13 = - X_F(2.0) * invmass1 *
X_CFLOAT a13 = - X_F(2.0) * invmass1 *
(s01.x * r12.x + s01.y * r12.y + s01.z * r12.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
X_CFLOAT a21 = X_F(2.0) * invmass0 *
(s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
X_CFLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
(s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
X_FLOAT a23 = X_F(2.0) * invmass2 *
X_CFLOAT a23 = X_F(2.0) * invmass2 *
(s02.x * r12.x + s02.y * r12.y + s02.z * r12.z);
X_FLOAT a31 = - X_F(2.0) * invmass1 *
X_CFLOAT a31 = - X_F(2.0) * invmass1 *
(s12.x * r01.x + s12.y * r01.y + s12.z * r01.z);
X_FLOAT a32 = X_F(2.0) * invmass2 *
X_CFLOAT a32 = X_F(2.0) * invmass2 *
(s12.x * r02.x + s12.y * r02.y + s12.z * r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass1 + invmass2) *
X_CFLOAT a33 = X_F(2.0) * (invmass1 + invmass2) *
(s12.x * r12.x + s12.y * r12.y + s12.z * r12.z);
// inverse of matrix
X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
X_CFLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
if(determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0) / determ;
X_CFLOAT determinv = X_F(1.0) / determ;
X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
X_CFLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
X_CFLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
X_CFLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
X_CFLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
X_CFLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
X_CFLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
X_CFLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
X_CFLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
X_CFLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_FLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z);
X_FLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z);
X_CFLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
X_CFLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z);
X_CFLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z);
X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_FLOAT quad1_1212 = invmass1 * invmass1 * r12sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112;
X_FLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212;
X_CFLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
X_CFLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
X_CFLOAT quad1_1212 = invmass1 * invmass1 * r12sq;
X_CFLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
X_CFLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112;
X_CFLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212;
X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_FLOAT quad2_1212 = invmass2 * invmass2 * r12sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
X_FLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112;
X_FLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212;
X_CFLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
X_CFLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
X_CFLOAT quad2_1212 = invmass2 * invmass2 * r12sq;
X_CFLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
X_CFLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112;
X_CFLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212;
X_FLOAT quad3_0101 = invmass1 * invmass1 * r01sq;
X_FLOAT quad3_0202 = invmass2 * invmass2 * r02sq;
X_FLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
X_FLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102;
X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
X_FLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
X_CFLOAT quad3_0101 = invmass1 * invmass1 * r01sq;
X_CFLOAT quad3_0202 = invmass2 * invmass2 * r02sq;
X_CFLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
X_CFLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102;
X_CFLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
X_CFLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda12 = X_F(0.0);
X_CFLOAT lamda01 = X_F(0.0);
X_CFLOAT lamda02 = X_F(0.0);
X_CFLOAT lamda12 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
X_CFLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
//maybe all running full loop?
while(__any(!done) && niter < _max_iter) {
@ -947,8 +947,8 @@ __device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
}
if(vflag || vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
X_CFLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda12 * r12.x * r12.x;
*shared = factor * v[0];
shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
@ -986,7 +986,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list,
else if(sflag == 4) FixShakeCuda_Shake4(vflag, vflag_atom, m);
else FixShakeCuda_Shake3Angle(vflag, vflag_atom, m);
} else {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
ENERGY_CFLOAT* shared = &sharedmem[threadIdx.x];
*shared = ENERGY_F(0.0);
shared += blockDim.x;
*shared = ENERGY_F(0.0);
@ -1008,7 +1008,7 @@ __global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list,
}
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@ -1018,15 +1018,15 @@ __global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistle
if(j > _nmax) _flag[0] = 1;
X_FLOAT3 xs = _xshake[j];
((X_FLOAT*) _buffer)[i] = xs.x + dx;
((X_FLOAT*) _buffer)[i + 1 * n] = xs.y + dy;
((X_FLOAT*) _buffer)[i + 2 * n] = xs.z + dz;
X_CFLOAT3 xs = _xshake[j];
((X_CFLOAT*) _buffer)[i] = xs.x + dx;
((X_CFLOAT*) _buffer)[i + 1 * n] = xs.y + dy;
((X_CFLOAT*) _buffer)[i + 2 * n] = xs.z + dz;
}
}
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@ -1036,7 +1036,7 @@ __global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxl
if(j > _nmax) _flag[0] = 1;
X_FLOAT3 xs = _xshake[j];
X_CFLOAT3 xs = _xshake[j];
xs.x += dx;
xs.y += dy;
xs.z += dz;
@ -1050,10 +1050,10 @@ __global__ void FixShakeCuda_UnpackComm_Kernel(int n, int first)
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < n) {
X_FLOAT3 xs;
xs.x = ((X_FLOAT*) _buffer)[i];
xs.y = ((X_FLOAT*) _buffer)[i + 1 * n];
xs.z = ((X_FLOAT*) _buffer)[i + 2 * n];
X_CFLOAT3 xs;
xs.x = ((X_CFLOAT*) _buffer)[i];
xs.y = ((X_CFLOAT*) _buffer)[i + 1 * n];
xs.z = ((X_CFLOAT*) _buffer)[i + 2 * n];
_xshake[i + first] = xs;
}
}

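FixShakeCuda_Shake2 above reduces a single bond constraint to the scalar quadratic a*lamda^2 + b*lamda + c = 0, with coefficients built from the constrained separation r01, the unconstrained update s01, and the inverse masses; the excerpt cuts off before the root selection, which presumably keeps the root of smaller magnitude (the smaller correction to the unconstrained positions). A host-side sketch of that closed-form step under those assumptions:

#include <cmath>

// Solve the two-atom SHAKE quadratic; returns false when the
// discriminant is negative (the kernel bumps _flag[0] in that case).
bool shake_lambda(double a, double b, double c, double& lamda)
{
    double determ = b * b - 4.0 * a * c;
    if(determ < 0.0) return false;
    double lamda1 = (-b + std::sqrt(determ)) / (2.0 * a);
    double lamda2 = (-b - std::sqrt(determ)) / (2.0 * a);
    lamda = (std::fabs(lamda1) <= std::fabs(lamda2)) ? lamda1 : lamda2;
    return true;
}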
View File

@ -36,7 +36,7 @@ void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
}
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
@ -48,7 +48,7 @@ void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
V_FLOAT factor = afactor;
V_CFLOAT factor = afactor;
if(sdata->atom.update_nmax)
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);

View File

@ -23,7 +23,7 @@
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

View File

@ -36,7 +36,7 @@ void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
}
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
@ -48,7 +48,7 @@ void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
V_FLOAT factor = afactor;
V_CFLOAT factor = afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)

View File

@ -23,7 +23,7 @@
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

View File

@ -36,7 +36,7 @@ void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
}
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
@ -48,7 +48,7 @@ void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
{
V_FLOAT factor = afactor;
V_CFLOAT factor = afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)

View File

@ -23,15 +23,15 @@
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor, V_FLOAT limit)
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal)
if(_mask[i] & groupbit) {
V_FLOAT vx = _v[i];
V_FLOAT vy = _v[i + _nmax];
V_FLOAT vz = _v[i + 2 * _nmax];
V_CFLOAT vx = _v[i];
V_CFLOAT vy = _v[i + _nmax];
V_CFLOAT vz = _v[i + 2 * _nmax];
vx *= factor;
vy *= factor;
vz *= factor;

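The EndOfStep kernels in these files scale each velocity component by a factor computed on the host; the Limit variant presumably also clamps the rescaled components, though the clamp itself falls outside this excerpt. A hypothetical reconstruction of the per-atom body:

__device__ void rescale_limited(float* vx, float* vy, float* vz,
                                float factor, float limit)
{
    // Scale, then clamp each component to [-limit, limit] (assumed semantics).
    *vx = fminf(fmaxf(*vx * factor, -limit), limit);
    *vy = fminf(fmaxf(*vy * factor, -limit), limit);
    *vz = fminf(fmaxf(*vz * factor, -limit), limit);
}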
View File

@ -35,8 +35,8 @@ void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
@ -60,7 +60,7 @@ void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void*
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_FLOAT*) gamma);
Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_CFLOAT*) gamma);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");

View File

@ -21,13 +21,13 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_FLOAT* gamma)
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal)
if(_mask[i] & groupbit) {
F_FLOAT drag = gamma[_type[i]];
F_CFLOAT drag = gamma[_type[i]];
_f[i] -= drag * _v[i];
_f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax];
_f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax];

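Cuda_FixViscousCuda_PostForce receives gamma as an opaque void* and the kernel indexes it as a per-type F_CFLOAT table (gamma[_type[i]]), applying the drag f -= gamma * v componentwise. A sketch of the host-side preparation such a call assumes, with illustrative names; note that LAMMPS type indices start at 1, so slot 0 is padding:

#include <cuda_runtime.h>

float* upload_gamma(const double* gamma_host, int ntypes)
{
    size_t bytes = (ntypes + 1) * sizeof(float);
    float* tmp = new float[ntypes + 1];
    for(int t = 0; t <= ntypes; ++t)
        tmp[t] = (float)gamma_host[t];        // narrow to device precision
    float* gamma_dev = nullptr;
    cudaMalloc(&gamma_dev, bytes);
    cudaMemcpy(gamma_dev, tmp, bytes, cudaMemcpyHostToDevice);
    delete[] tmp;
    return gamma_dev;                         // pass as the kernel's gamma
}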
View File

@ -38,7 +38,7 @@
#define _nex_group MY_AP(nex_group)
#define _ex_mol_bit MY_AP(ex_mol_bit)
#define _nex_mol MY_AP(nex_mol)
__device__ __constant__ CUDA_FLOAT* _cutneighsq;
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
@ -54,7 +54,7 @@ void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist*
{
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT)));
int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@ -77,7 +77,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
// initialize only on first call
CUDA_FLOAT rez_bin_size[3] = {
CUDA_CFLOAT rez_bin_size[3] = {
(1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
(1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
(1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
@ -87,10 +87,10 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
if(! init) {
init = 0;
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_CFLOAT) * 3);
}
@ -101,7 +101,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
my_times starttime, endtime;
my_gettime(CLOCK_REALTIME, &starttime);
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
cudaThreadSynchronize();
@@ -126,7 +126,7 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
//Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
CUDA_FLOAT globcutoff = -1.0;
CUDA_CFLOAT globcutoff = -1.0;
short init = 0;
@@ -137,11 +137,11 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
//printf("Allocate: %i\n",nx);
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);
if(sneighlist->cutneighsq) {
int cutoffsdiffer = 0;
@@ -149,13 +149,13 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
}
}
if(not cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0;
if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
} else {
MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
return 0;
@@ -173,7 +173,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
}
CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*));
cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(special_flag) , sdata->atom.special_flag , 4 * sizeof(int));
@@ -218,14 +218,14 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
int buffer[20];
buffer[0] = 1;
buffer[1] = 0;
CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
//cudaMemset(sdata->debugdata,0,100*sizeof(int));
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
//shared_size=2056;
my_times starttime, endtime;
@@ -245,7 +245,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
(sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
}
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
// (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
cudaThreadSynchronize();
@@ -301,13 +301,13 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
if(sneighlist->cutneighsq) {
for(int i = 1; i <= sdata->atom.ntypes; ++i) {
for(int j = 1; j <= sdata->atom.ntypes; ++j) {
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
//printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
}
}
@@ -339,7 +339,7 @@ int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sn
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
free(acutneighsq);

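A note on the host code above: the per-type-pair cutoff matrix cutneighsq[i][j] is flattened row-major into a (ntypes+1)-squared table because LAMMPS types are 1-based, and when every pairwise cutoff agrees the kernels receive a single globcutoff instead, so they can skip the per-pair table lookup. A sketch of that flattening under those assumptions, with plain float standing in for CUDA_CFLOAT:

#include <cstdlib>

// Returns the flattened table; sets *globcutoff to the shared cutoff,
// or to -1.0 when the kernels must consult the table per pair.
float* flatten_cutneighsq(double** cutneighsq, int ntypes, float* globcutoff)
{
  unsigned cuda_ntypes = ntypes + 1;           // types are 1-based
  float* flat = (float*) malloc(sizeof(float) * cuda_ntypes * cuda_ntypes);
  double cutoff0 = cutneighsq[1][1];
  int cutoffsdiffer = 0;
  for(int i = 1; i <= ntypes; ++i)
    for(int j = 1; j <= ntypes; ++j) {
      flat[i * cuda_ntypes + j] = (float) cutneighsq[i][j];
      if((cutneighsq[i][j] - cutoff0) * (cutneighsq[i][j] - cutoff0) > 1e-6)
        cutoffsdiffer = 1;
    }
  *globcutoff = cutoffsdiffer ? -1.0f : (float) cutoff0;
  return flat;
}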
View File

@@ -24,26 +24,26 @@
#define SBBITS 30
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y, CUDA_CFLOAT rez_bin_size_z)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
/*int* bin_count=(int*) _buffer;
bin_count=bin_count+20;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
CUDA_CFLOAT* binned_x=(CUDA_CFLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
if(i < _nall) {
// copy atom position from global device memory to local register
// in this 3 steps to get as much coalesced access as possible
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x;
X_CFLOAT* my_x = _x + i;
CUDA_CFLOAT x_i = *my_x;
my_x += _nmax;
CUDA_FLOAT y_i = *my_x;
CUDA_CFLOAT y_i = *my_x;
my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
CUDA_CFLOAT z_i = *my_x;
// calculate flat bin index
@@ -102,7 +102,7 @@ __device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
return 0;
}
extern __shared__ CUDA_FLOAT shared[];
extern __shared__ CUDA_CFLOAT shared[];
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
{
@@ -114,12 +114,12 @@ __device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
}
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall)
{
int natoms = neighall ? _nall : _nlocal;
//const bool domol=false;
int bin_dim_z = gridDim.y;
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
@@ -129,19 +129,19 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
CUDA_CFLOAT cut;
if(globcutoff > 0)
cut = globcutoff;
int i = _nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i, y_i, z_i;
CUDA_CFLOAT* my_x;
CUDA_CFLOAT x_i, y_i, z_i;
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
int actIdx = threadIdx.x + actOffset;
CUDA_FLOAT* other_x = shared;
CUDA_CFLOAT* other_x = shared;
int* other_id = (int*) &other_x[3 * blockDim.x];
if(actIdx < bin_c) {
@@ -206,10 +206,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
CUDA_CFLOAT delx = x_i - other_x[kk];
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq <= cut && i != j) {
@@ -268,10 +268,10 @@ __global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bi
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
CUDA_CFLOAT delx = x_i - other_x[k];
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq <= cut && i != j) {
if(jnum < _maxneighbors) {
@@ -378,10 +378,10 @@ __global__ void FindSpecial(int block_style)
_numneigh[i] = jnum;
}
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style)
{
int bin_dim_z = gridDim.y;
CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
@@ -391,19 +391,19 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
CUDA_CFLOAT cut;
if(globcutoff > 0)
cut = globcutoff;
int i = _nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i, y_i, z_i;
CUDA_CFLOAT* my_x;
CUDA_CFLOAT x_i, y_i, z_i;
for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
int actIdx = threadIdx.x + actOffset;
CUDA_FLOAT* other_x = shared;
CUDA_CFLOAT* other_x = shared;
int* other_id = (int*) &other_x[3 * blockDim.x];
if(actIdx < bin_c) {
@@ -469,10 +469,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
CUDA_CFLOAT delx = x_i - other_x[kk];
CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq <= cut && i != j) {
@@ -549,10 +549,10 @@ __global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
CUDA_CFLOAT delx = x_i - other_x[k];
CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq <= cut && i != j) {
if((j >= _nlocal) && (i_border < 0))
@@ -612,12 +612,12 @@ __global__ void NeighborBuildFullNsq_Kernel()
int* buffer = (int*) _buffer;
if(i < _nlocal) {
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x;
X_CFLOAT* my_x = _x + i;
CUDA_CFLOAT x_i = *my_x;
my_x += _nmax;
CUDA_FLOAT y_i = *my_x;
CUDA_CFLOAT y_i = *my_x;
my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
CUDA_CFLOAT z_i = *my_x;
int jnum = 0;
int* jlist = _firstneigh[i];
_ilist[i] = i;
@@ -627,15 +627,15 @@ __global__ void NeighborBuildFullNsq_Kernel()
for(int j = 0; j < _nall; ++j) {
my_x = _x + j;
CUDA_FLOAT x_j = *my_x;
CUDA_CFLOAT x_j = *my_x;
my_x += _nmax;
CUDA_FLOAT y_j = *my_x;
CUDA_CFLOAT y_j = *my_x;
my_x += _nmax;
CUDA_FLOAT z_j = *my_x;
CUDA_FLOAT delx = x_i - x_j;
CUDA_FLOAT dely = y_i - y_j;
CUDA_FLOAT delz = z_i - z_j;
CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
CUDA_CFLOAT z_j = *my_x;
CUDA_CFLOAT delx = x_i - x_j;
CUDA_CFLOAT dely = y_i - y_j;
CUDA_CFLOAT delz = z_i - z_j;
CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
int jtype = _type[j];
if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {

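NeighborBuildFullNsq_Kernel above gives each local atom one thread that scans all _nall positions through the strided SoA pointer walk (my_x += _nmax); the binned kernels earlier in this file avoid that O(N^2) cost by staging bin contents in shared memory and visiting only nearby bins. A CPU reference of the brute-force variant, with the device arrays passed in explicitly (the signature is an assumption for illustration):

// Sketch only: x is an SoA array (component k of atom i at i + k*nmax),
// cutneighsq is the flattened per-type-pair table from the host code.
void neighbor_full_nsq(int nlocal, int nall, int nmax, const double* x,
                       const int* type, int cuda_ntypes,
                       const double* cutneighsq, int maxneighbors,
                       int** firstneigh, int* numneigh)
{
  for(int i = 0; i < nlocal; ++i) {
    int jnum = 0;
    int itype = type[i];
    for(int j = 0; j < nall; ++j) {
      double delx = x[i] - x[j];
      double dely = x[i + nmax] - x[j + nmax];
      double delz = x[i + 2 * nmax] - x[j + 2 * nmax];
      double rsq = delx * delx + dely * dely + delz * delz;
      if(i != j && jnum < maxneighbors &&
         rsq <= cutneighsq[itype * cuda_ntypes + type[j]])
        firstneigh[i][jnum++] = j;
    }
    numneigh[i] = jnum;
  }
}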
View File

@@ -60,10 +60,10 @@ void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

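This launcher pattern repeats for every pair style below: the third launch parameter reserves sharedperproc ENERGY_CFLOAT accumulator slots per thread for the block-level energy/virial reduction. Judging by the sharedperproc arithmetic visible in the EAM and granular launchers further down, that is one slot for the energy plus six for the virial tensor; a sketch of the bookkeeping, reusing the surrounding names:

// Sketch of the shared-memory sizing the Pair_Kernel launches use.
int sharedperproc = 0;
if(eflag) sharedperproc += 1;   // per-thread energy accumulator
if(vflag) sharedperproc += 6;   // xx, yy, zz, xy, xz, yz virial slots
size_t shmem = (size_t) sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;
Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
    <<< grid, threads, shmem, streams[1] >>>
    (eflag, vflag, eflag_atom, vflag_atom);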
View File

@@ -20,13 +20,13 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_FLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
const F_FLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
const F_CFLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv;
if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv

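Two details worth spelling out in PairBornCuda_Eval: because r2inv = 1/r^2, _RSQRT_(r2inv) yields r itself with a single fast reciprocal square root, and forceborn is r times the negative derivative of the Born-Mayer-Huggins energy, so the caller's extra factor of r2inv turns it into the F/r that scales each displacement component. In comment form:

//   E(r)      = A exp((sigma - r)/rho) - C/r^6 + D/r^8
//   -dE/dr    = (A/rho) exp((sigma - r)/rho) - 6C/r^7 + 8D/r^9
//   forceborn = r * (-dE/dr)
//   fpair     = factor_lj * forceborn * r2inv   // = F/r for delx/dely/delz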
View File

@@ -58,10 +58,10 @@ void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sn
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -59,10 +59,10 @@ void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -60,10 +60,10 @@ void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -20,13 +20,13 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairBuckCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT rexp = _EXP_(-r * _rhoinv[ij_type]);
const F_FLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT rexp = _EXP_(-r * _rhoinv[ij_type]);
const F_CFLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv;
if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv -
_offset[ij_type]);

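The Buckingham variant follows the same scheme with host-precomputed combinations; matching forcebuck against the energy line implies buck1 = A/rho and buck2 = 6C:

//   E(r)      = A exp(-r/rho) - C/r^6
//   forcebuck = r * (-dE/dr) = (A/rho) r exp(-r/rho) - 6C/r^6
//             = buck1 * r * rexp - buck2 * r6inv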
View File

@@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -65,10 +65,10 @@ void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -71,10 +71,10 @@ void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -21,28 +21,28 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4
__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl) //0.11 of 0.4
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const int cg_type = _cg_type[ij_type];
const F_FLOAT r4inv = r2inv * r2inv;
const F_FLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
const F_FLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
const F_CFLOAT r4inv = r2inv * r2inv;
const F_CFLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
const F_CFLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
const F_CFLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]);
return factor_lj * forcelj * r2inv;
}
/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
/*__device__ inline F_CFLOAT PairCGCMMCuda_Eval(const F_CFLOAT& rsq,const int ij_type,F_CFLOAT& factor_lj,int& eflag, ENERGY_CFLOAT& evdwl)
{
const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type);
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r4inv = r2inv*r2inv;
const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0);
const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
const F_CFLOAT r2inv = F_F(1.0)/rsq;
const F_CFLOAT r4inv = r2inv*r2inv;
const F_CFLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
const F_CFLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0);
const F_CFLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second));
return factor_lj*forcelj*r2inv;

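The two conditional selects in PairCGCMMCuda_Eval fold the three CMM exponent pairs into one branch-light expression, which keeps all variants on the same instruction path instead of diverging per warp. Expanding them (variant names inferred from the CG_LJ constants):

//   CG_LJ12_4: forcelj = r4inv * (lj1*r4inv*r4inv - lj2)        // r^-12, r^-4
//   CG_LJ9_6 : forcelj = r4inv * (lj1*r4inv/r     - lj2*r2inv)  // r^-9,  r^-6
//   otherwise: forcelj = r4inv * (lj1*r4inv*r4inv - lj2*r2inv)  // r^-12, r^-6
// (the minus signs are carried inside rNinv_second)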
View File

@@ -39,18 +39,18 @@
#define _rho MY_AP(rho)
#define _fp MY_AP(fp)
__device__ __constant__ F_FLOAT MY_AP(rdr);
__device__ __constant__ F_FLOAT MY_AP(rdrho);
__device__ __constant__ F_CFLOAT MY_AP(rdr);
__device__ __constant__ F_CFLOAT MY_AP(rdrho);
__device__ __constant__ int MY_AP(nr);
__device__ __constant__ int MY_AP(nrho);
__device__ __constant__ int MY_AP(nfrho);
__device__ __constant__ int MY_AP(nrhor);
__device__ __constant__ int MY_AP(nz2r);
__device__ __constant__ F_FLOAT* MY_AP(frho_spline);
__device__ __constant__ F_FLOAT* MY_AP(rhor_spline);
__device__ __constant__ F_FLOAT* MY_AP(z2r_spline);
__device__ __constant__ F_FLOAT* MY_AP(rho);
__device__ __constant__ F_FLOAT* MY_AP(fp);
__device__ __constant__ F_CFLOAT* MY_AP(frho_spline);
__device__ __constant__ F_CFLOAT* MY_AP(rhor_spline);
__device__ __constant__ F_CFLOAT* MY_AP(z2r_spline);
__device__ __constant__ F_CFLOAT* MY_AP(rho);
__device__ __constant__ F_CFLOAT* MY_AP(fp);
#define _rhor_spline_tex MY_AP(rhor_spline_tex)
#if F_PRECISION == 1
@@ -115,10 +115,10 @@ inline void BindEAMTextures(cuda_shared_data* sdata)
void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@@ -151,13 +151,13 @@ void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighl
void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
}
@@ -175,18 +175,18 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
unsigned nI = sizeof(F_CFLOAT) * cuda_ntypes * cuda_ntypes;
X_FLOAT cutsq_global;
cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
X_CFLOAT cutsq_global;
cutsq_global = (X_CFLOAT)(sdata->pair.cut_global);
cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_CFLOAT));
F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
F_CFLOAT* coeff_buf = new F_CFLOAT[cuda_ntypes * cuda_ntypes];
for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_CFLOAT));
for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
@@ -197,34 +197,34 @@ void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, in
cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
delete [] coeff_buf;
X_FLOAT box_size[3] = {
X_CFLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
F_FLOAT rdr_F = rdr;
F_FLOAT rdrho_F = rdrho;
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
F_CFLOAT rdr_F = rdr;
F_CFLOAT rdrho_F = rdrho;
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT);
z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_CFLOAT);
rhor_spline_pointer = rhor_spline;
z2r_spline_pointer = z2r_spline;
@@ -249,8 +249,8 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
if(sdata->buffer_new)
Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
int sharedperproc = 0;
@@ -258,7 +258,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
if(vflag || vflag_atom) sharedperproc = 7;
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
@@ -270,7 +270,7 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
@@ -288,7 +288,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
if(vflag || vflag_atom) sharedperproc = 7;
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
@@ -300,7 +300,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
@@ -310,7 +310,7 @@ void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis
grid.x = sharedperproc;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)*sharedperproc>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
}
@@ -324,19 +324,19 @@ void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void*
int3 layout = getgrid(n, 0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
F_CFLOAT* buf = (F_CFLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, buf);
cudaThreadSynchronize();
cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(buf_send, buf, n* sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
}
void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
{
F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
F_CFLOAT* fp_first = &(((F_CFLOAT*) fp)[first]);
cudaMemcpy(fp_first, buf_recv, n * sizeof(F_CFLOAT), cudaMemcpyHostToDevice);
}
#undef _type2frho

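The EAM host code stages a two-pass scheme: Kernel1 accumulates each atom's electron density rho and embedding derivative fp from the rhor/frho spline tables, fp is then forward-communicated to ghost atoms via the Pack/UnpackComm routines, and Kernel2 combines fp with the z2r table into pair forces. All tables share the interpolation pattern visible in the kernels below: map r onto a segment index m plus fractional offset p, then evaluate the cubic with Horner's rule. A sketch of one such lookup, assuming the 7-coefficient-per-knot layout implied by EAM_COEFF_LENGTH (indices 0-2 holding the derivative cubic, 3-6 the value cubic):

// Sketch only: returns the interpolated table value at distance r.
__device__ double eam_spline_value(const double* spline, double r,
                                   double rdr, int nr)
{
  double p = r * rdr + 1.0;
  int m = (int) p;
  m = m < nr - 1 ? m : nr - 1;        // clamp to the last segment
  p -= m;
  p = p < 1.0 ? p : 1.0;
  const double* c = &spline[m * 7];   // 7 coefficients per knot
  return ((c[3] * p + c[4]) * p + c[5]) * p + c[6];
}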
View File

@@ -24,7 +24,7 @@
static __device__ inline F_FLOAT4 fetchRhor(int i)
static __device__ inline F_CFLOAT4 fetchRhor(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
@@ -37,7 +37,7 @@ static __device__ inline F_FLOAT4 fetchRhor(int i)
#endif
}
static __device__ inline F_FLOAT4 fetchZ2r(int i)
static __device__ inline F_CFLOAT4 fetchZ2r(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
@@ -52,8 +52,8 @@ static __device__ inline F_FLOAT4 fetchZ2r(int i)
__global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
@@ -73,9 +73,9 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT delx, dely, delz;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT delx, dely, delz;
int itype;
int i = _nlocal;
int jnum = 0;
@@ -109,17 +109,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < _cutsq_global) {
F_FLOAT p = sqrt(rsq) * _rdr + F_F(1.0);
F_CFLOAT p = sqrt(rsq) * _rdr + F_F(1.0);
int m = static_cast<int>(p);
m = MIN(m, _nr - 1);
p -= m;
p = MIN(p, F_F(1.0));
int k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
F_FLOAT4 c = fetchRhor(k + 1);
F_CFLOAT4 c = fetchRhor(k + 1);
_rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z;
}
}
@@ -127,12 +127,12 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
if(ii < _inum) {
F_FLOAT p = _rho[i] * _rdrho + F_F(1.0);
F_CFLOAT p = _rho[i] * _rdrho + F_F(1.0);
int m = static_cast<int>(p);
m = MAX(1, MIN(m, _nrho - 1));
p -= m;
p = MIN(p, F_F(1.0));
F_FLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
F_CFLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
_fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2];
if(eflag || eflag_atom) {
@@ -148,17 +148,17 @@ __global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vf
_eatom[i] += sharedmem[threadIdx.x];
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0];
}
}
__global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
@ -178,10 +178,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_FLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
F_FLOAT fxtmp, fytmp, fztmp, fpair;
F_FLOAT delx, dely, delz;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
F_CFLOAT delx, dely, delz;
int itype, i;
int jnum = 0;
int* jlist;
@@ -206,7 +206,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
_rho[i] = F_F(0.0);
}
if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii];
if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_CFLOAT*) _buffer)[ii];
__syncthreads();
@@ -219,35 +219,35 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < _cutsq_global) {
F_FLOAT r = _SQRT_(rsq);
F_FLOAT p = r * _rdr + F_F(1.0);
F_CFLOAT r = _SQRT_(rsq);
F_CFLOAT p = r * _rdr + F_F(1.0);
int m = static_cast<int>(p);
m = MIN(m, _nr - 1);
p -= m;
p = MIN(p, F_F(1.0));
int k = (static_cast <int>(_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
F_FLOAT4 c = fetchRhor(k);
F_FLOAT rhoip = (c.x * p + c.y) * p + c.z;
F_CFLOAT4 c = fetchRhor(k);
F_CFLOAT rhoip = (c.x * p + c.y) * p + c.z;
k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
c = fetchRhor(k);
F_FLOAT rhojp = (c.x * p + c.y) * p + c.z;
F_CFLOAT rhojp = (c.x * p + c.y) * p + c.z;
k = (static_cast <int>(_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
c = fetchZ2r(k);
F_FLOAT z2p = (c.x * p + c.y) * p + c.z;
F_CFLOAT z2p = (c.x * p + c.y) * p + c.z;
c = fetchZ2r(k + 1);
F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;
F_CFLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;
F_FLOAT recip = F_F(1.0) / r;
F_FLOAT phi = z2 * recip;
F_FLOAT phip = z2p * recip - phi * recip;
F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
F_CFLOAT recip = F_F(1.0) / r;
F_CFLOAT phi = z2 * recip;
F_CFLOAT phip = z2p * recip - phi * recip;
F_CFLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
fpair = -psip * recip;
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * fpair;
fytmp += dyfp = dely * fpair;
fztmp += dzfp = delz * fpair;
@@ -268,10 +268,10 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
__syncthreads();
if(ii < _inum) {
F_FLOAT* my_f;
F_CFLOAT* my_f;
if(_collect_forces_later) {
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer = &buffer[1 * gridDim.x * gridDim.y];
@@ -281,7 +281,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_FLOAT*) buffer;
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = fxtmp;
my_f += _nmax;
@@ -320,7 +320,7 @@ __global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vf
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
}
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer)
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_CFLOAT* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* list = sendlist + iswap * maxlistlength;
@@ -331,7 +331,7 @@ __global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlen
}
}
__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer)
__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_CFLOAT* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

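The Pack/UnpackComm kernels above are the glue between the two EAM passes: a gather of fp values along the swap's send list into a contiguous buffer, which the host then ships to the neighboring MPI rank. The gather half, sketched with explicit arguments in place of the constant-memory symbols (an illustration, not the commit's code):

// Sketch only: sendlist holds one row of maxlistlength indices per swap.
__global__ void pack_fp(const int* sendlist, int n, int maxlistlength,
                        int iswap, const double* fp, double* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int* list = sendlist + iswap * maxlistlength;
  if(i < n) buffer[i] = fp[list[i]];
}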
View File

@@ -37,10 +37,10 @@
void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed");
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_FLOAT));
int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_FLOAT);
int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
@@ -72,15 +72,15 @@ void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neig
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_CFLOAT4*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(maxneighbors), &sneighlist->maxneighbors , sizeof(int));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int));
@@ -101,32 +101,32 @@ void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata)
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
unsigned n = sizeof(F_CFLOAT) * cuda_ntypes2;
F_FLOAT coeffs1[cuda_ntypes2];
coeffs1[0] = (F_FLOAT) sdata->pair.coeff1[0][0];
coeffs1[1] = (F_FLOAT) sdata->pair.coeff1[0][1];
coeffs1[2] = (F_FLOAT) sdata->pair.coeff1[1][0];
F_FLOAT coeffs3[cuda_ntypes2];
coeffs3[0] = (F_FLOAT) sdata->pair.coeff1[1][1];
F_FLOAT coeffs2[cuda_ntypes2];
coeffs2[0] = (F_FLOAT) sdata->pair.coeff2[0][0];
coeffs2[1] = (F_FLOAT) sdata->pair.coeff2[0][1];
F_CFLOAT coeffs1[cuda_ntypes2];
coeffs1[0] = (F_CFLOAT) sdata->pair.coeff1[0][0];
coeffs1[1] = (F_CFLOAT) sdata->pair.coeff1[0][1];
coeffs1[2] = (F_CFLOAT) sdata->pair.coeff1[1][0];
F_CFLOAT coeffs3[cuda_ntypes2];
coeffs3[0] = (F_CFLOAT) sdata->pair.coeff1[1][1];
F_CFLOAT coeffs2[cuda_ntypes2];
coeffs2[0] = (F_CFLOAT) sdata->pair.coeff2[0][0];
coeffs2[1] = (F_CFLOAT) sdata->pair.coeff2[0][1];
X_FLOAT box_size[3] = {
X_CFLOAT box_size[3] = {
sdata->domain.subhi[0] - sdata->domain.sublo[0],
sdata->domain.subhi[1] - sdata->domain.sublo[1],
sdata->domain.subhi[2] - sdata->domain.sublo[2]
};
//printf("n: %i %i\n",n,CUDA_MAX_TYPES2);
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes , sizeof(unsigned));
cudaMemcpyToSymbol(MY_AP(coeff1) , coeffs1 , n);
cudaMemcpyToSymbol(MY_AP(coeff2) , coeffs2 , n);
cudaMemcpyToSymbol(MY_AP(coeff3) , coeffs3 , n);
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed");
}
@@ -156,7 +156,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
if(vflag) sharedperproc += 6;
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT), 128);
int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_CFLOAT), 128);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
@ -168,11 +168,11 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
Cuda_PairGranHookeCuda_Init(sdata);
}
MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_CFLOAT)*threads.x);)
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation");
PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
, (F_FLOAT) sdata->pair.coeff1[0][0], (F_FLOAT) sdata->pair.coeff1[1][0], (F_FLOAT) sdata->pair.coeff1[1][1], (F_FLOAT) sdata->pair.coeff2[0][0]);
PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
, (F_CFLOAT) sdata->pair.coeff1[0][0], (F_CFLOAT) sdata->pair.coeff1[1][0], (F_CFLOAT) sdata->pair.coeff1[1][1], (F_CFLOAT) sdata->pair.coeff2[0][0]);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed");
@@ -181,7 +181,7 @@ void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
grid.x = sharedperproc;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed");
}

View File

@@ -23,12 +23,12 @@
__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id
, F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu)
, F_CFLOAT kn, F_CFLOAT gamman, F_CFLOAT gammat, F_CFLOAT xmu)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedV;
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedV;
if(eflag || eflag_atom) {
sharedE = &sharedmem[threadIdx.x];
@@ -51,18 +51,18 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");)
X_FLOAT xtmp, ytmp, ztmp;
X_CFLOAT xtmp, ytmp, ztmp;
X_FLOAT4 myxtype;
V_FLOAT4 myvradius, ovradius;
F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
F_FLOAT delx, dely, delz;
F_FLOAT radi, radj, radsum, r, rsqinv;
F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
F_FLOAT wr1, wr2, wr3;
F_FLOAT vtr1, vtr2, vtr3, vrel;
F_FLOAT meff, damp, ccel, tor1, tor2, tor3;
F_FLOAT fn, fs, ft, fs1, fs2, fs3;
X_CFLOAT4 myxtype;
V_CFLOAT4 myvradius, ovradius;
F_CFLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
F_CFLOAT delx, dely, delz;
F_CFLOAT radi, radj, radsum, r, rsqinv;
F_CFLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
F_CFLOAT wr1, wr2, wr3;
F_CFLOAT vtr1, vtr2, vtr3, vrel;
F_CFLOAT meff, damp, ccel, tor1, tor2, tor3;
F_CFLOAT fn, fs, ft, fs1, fs2, fs3;
int jnum = 0;
int i, j;
@@ -108,10 +108,10 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
radj = ovradius.w;
radsum = radi + radj;
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < radsum * radsum) {
const F_FLOAT rinv = _RSQRT_(rsq);
const F_CFLOAT rinv = _RSQRT_(rsq);
r = F_F(1.0) / rinv;
rsqinv = F_F(1.0) / rsq;
@@ -135,8 +135,8 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
vt3 = vr3 - vn3;
// relative rotational velocity
V_FLOAT4 omegarmass_i = fetchOmegaRmass(i);
V_FLOAT4 omegarmass_j = fetchOmegaRmass(j);
V_CFLOAT4 omegarmass_i = fetchOmegaRmass(i);
V_CFLOAT4 omegarmass_j = fetchOmegaRmass(j);
wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv;
wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv;
@@ -165,7 +165,7 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
fs2 = -ft * vtr2;
fs3 = -ft * vtr3;
F_FLOAT dxfp, dyfp, dzfp;
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * ccel + fs1;
fytmp += dyfp = dely * ccel + fs2;
fztmp += dzfp = delz * ccel + fs3;
@@ -194,13 +194,13 @@ __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, i
__syncthreads();
if(ii < _inum) {
F_FLOAT* my_f = _f + i;
F_CFLOAT* my_f = _f + i;
*my_f += fxtmp;
my_f += _nmax;
*my_f += fytmp;
my_f += _nmax;
*my_f += fztmp;
F_FLOAT* my_torque = _torque + i;
F_CFLOAT* my_torque = _torque + i;
*my_torque += torquextmp;
my_torque += _nmax;
*my_torque += torqueytmp;

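For orientation, the contact law this kernel implements is the history-less granular Hooke model; in scalar form, under the variable names used above (a reading aid, with the decomposition assumed from the visible quantities):

//   contact : rsq < (radi + radj)^2, i.e. overlap delta = radsum - r > 0
//   normal  : Fn = kn*delta - meff*gamman*vn     (spring + viscous damping)
//   tangent : Ft = -ft*vtr, with ft capped so |Ft| <= xmu*|Fn|  (Coulomb limit)
//   torque  : tor = (contact arm) x Ft, accumulated per atom like the force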
View File

@@ -63,10 +63,10 @@ void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -21,12 +21,12 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairLJ96CutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_FLOAT r3inv = _SQRT_(r6inv);
const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT r3inv = _SQRT_(r6inv);
const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]);
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]);

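A small reading aid for the 9-6 form: the odd power r^-3 is produced as _SQRT_(r6inv) rather than by an extra division, and matching forcelj against the energy line implies the host precomputes lj1 = 9*lj3 and lj2 = 6*lj4 so that forcelj = r*(-dE/dr):

//   E(r)       = lj3/r^9 - lj4/r^6 - offset
//   r*(-dE/dr) = 9*lj3/r^9 - 6*lj4/r^6 = r6inv * (lj1*r3inv - lj2)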
View File

@@ -33,12 +33,12 @@
#include <time.h>
void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv)
{
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT));
return;
}
@@ -46,7 +46,7 @@ void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_c
void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul)
{
static short init = 0;
@@ -65,10 +65,10 @@ void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighl
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -23,4 +23,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul);

View File

@@ -20,24 +20,24 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairLJCharmmCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
F_FLOAT philj, switch1;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
F_CFLOAT philj, switch1;
if(rsq > _cut_innersq_global) {
switch1 = (_cutsq_global - rsq) * (_cutsq_global - rsq) *
(_cutsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_innersq_global) * _denom_lj_inv;
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
(rsq - _cut_innersq_global) * _denom_lj_inv;
philj = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
forcelj = forcelj * switch1 + philj * switch2;
}
if(eflag) {
ENERGY_FLOAT evdwl_tmp = factor_lj;
ENERGY_CFLOAT evdwl_tmp = factor_lj;
if(rsq > _cut_innersq_global) {
evdwl_tmp *= philj * switch1;
@@ -50,16 +50,16 @@ __device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij
return factor_lj * forcelj * r2inv;
}
__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
__device__ inline F_CFLOAT CoulCharmmCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
{
F_FLOAT forcecoul;
ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
F_CFLOAT forcecoul;
ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
if(rsq > _cut_coul_innersq_global) {
const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
(_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
ecoul_tmp *= switch1;
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
(rsq - _cut_coul_innersq_global) * _denom_coul_inv;
forcecoul *= switch1 + switch2;
}
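
The switch1/switch2 pair above is the standard CHARMM switching function: switch1 tapers the potential between the inner and outer cutoffs, switch2 is the extra force term from its derivative, and _denom_lj_inv / _denom_coul_inv are the precomputed 1/(cutsq - cut_innersq)^3. A host-side sketch with illustrative names:

// CHARMM switch: energy is scaled by switch1, and the radial force
// picks up phi*switch2, exactly as forcelj/forcecoul are updated above.
void charmm_switch(double rsq, double cutsq, double cut_innersq,
                   double denom_inv, double &switch1, double &switch2)
{
  switch1 = (cutsq - rsq) * (cutsq - rsq) *
            (cutsq + 2.0 * rsq - 3.0 * cut_innersq) * denom_inv;
  switch2 = 12.0 * rsq * (cutsq - rsq) * (rsq - cut_innersq) * denom_inv;
}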

View File

@@ -30,9 +30,9 @@
#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global)
#define _denom_lj_inv MY_AP(denom_lj_inv)
#define _denom_coul_inv MY_AP(denom_coul_inv)
__device__ __constant__ F_FLOAT _cut_coul_innersq_global;
__device__ __constant__ F_FLOAT _denom_lj_inv;
__device__ __constant__ F_FLOAT _denom_coul_inv;
__device__ __constant__ F_CFLOAT _cut_coul_innersq_global;
__device__ __constant__ F_CFLOAT _denom_lj_inv;
__device__ __constant__ F_CFLOAT _denom_coul_inv;
#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
@@ -40,12 +40,12 @@ __device__ __constant__ F_FLOAT _denom_coul_inv;
#include <time.h>
void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_lj_inv, F_CFLOAT denom_coul_inv)
{
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_CFLOAT));
return;
}
@@ -53,7 +53,7 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLO
void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul)
{
static short init = 0;
@@ -72,10 +72,10 @@ void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_share
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -23,4 +23,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj, F_CFLOAT cut_coul_innersq, F_CFLOAT denom_coul);

View File

@@ -21,16 +21,16 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
__device__ inline F_CFLOAT CoulCharmmImplicitCuda_Eval(const F_CFLOAT &rsq, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
{
F_FLOAT forcecoul;
ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
F_CFLOAT forcecoul;
ENERGY_CFLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
if(rsq > _cut_coul_innersq_global) {
const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
const F_CFLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
(_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
ecoul_tmp *= switch1;
const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
const F_CFLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
(rsq - _cut_coul_innersq_global) * _denom_coul_inv;
forcecoul *= (switch1 + switch2);
}

View File

@@ -32,10 +32,10 @@
#include <time.h>
void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_lj_inv)
void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_CFLOAT denom_lj_inv)
{
Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_CFLOAT));
return;
}
@@ -43,7 +43,7 @@ void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_l
void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
int eflag_atom, int vflag_atom, F_FLOAT denom_lj)
int eflag_atom, int vflag_atom, F_CFLOAT denom_lj)
{
static short init = 0;
@@ -62,10 +62,10 @@ void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -23,4 +23,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj);
extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT denom_lj);

View File

@@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -58,10 +58,10 @@ void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlis
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -52,7 +52,7 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
dim3 grid, threads;
int sharedperproc;
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
//if(CUDA_ARCH==20) maxthreads*=2;
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
@@ -60,10 +60,10 @@ void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -21,11 +21,11 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairLJClass2Cuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_FLOAT r3inv = _SQRT_(r6inv);
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT r3inv = _SQRT_(r6inv);
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv -
_lj4[ij_type]) - _offset[ij_type]);

View File

@@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* s
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
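
Only the launcher changes here; the COUL_DEBYE evaluation it dispatches is the screened Coulomb E = qqrd2e*qi*qj*exp(-kappa*r)/r, whose radial force is F = E*(kappa + 1/r). A hedged host-side sketch; the function name and argument layout are illustrative, not taken from this diff:

#include <cmath>

double coul_debye(double rsq, double qi, double qj,
                  double kappa, double qqrd2e, double &ecoul)
{
  const double r = std::sqrt(rsq);
  const double screening = std::exp(-kappa * r);
  const double e = qqrd2e * qi * qj * screening / r;  // pair energy
  ecoul += e;
  return e * (kappa + 1.0 / r) / r;                   // F/r = -(dE/dr)/r
}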

View File

@@ -58,10 +58,10 @@ void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist*
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -52,7 +52,7 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
dim3 grid, threads;
int sharedperproc;
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
//if(CUDA_ARCH==20) maxthreads*=2;
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
@@ -60,10 +60,10 @@ void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighli
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -21,10 +21,10 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairLJCutCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r2inv = F_F(1.0) / rsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv -
_lj4[ij_type]) - _offset[ij_type]);
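
This hunk elides the forcelj line of the plain 12/6 evaluation. For reference, a host-side sketch of the complete function, assuming the usual LAMMPS lj/cut prefactors (lj1 = 48*eps*sigma^12, lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6) and the F/r return convention used elsewhere in this diff; lj_cut_pair is an illustrative name:

double lj_cut_pair(double rsq, double lj1, double lj2,
                   double lj3, double lj4, double offset,
                   double factor_lj, double &evdwl)
{
  const double r2inv = 1.0 / rsq;
  const double r6inv = r2inv * r2inv * r2inv;
  const double forcelj = r6inv * (lj1 * r6inv - lj2);
  evdwl += factor_lj * (r6inv * (lj3 * r6inv - lj4) - offset);
  return factor_lj * forcelj * r2inv;   // F/r
}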

View File

@@ -51,7 +51,7 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli
dim3 grid, threads;
int sharedperproc;
//int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
//int maxthreads=192*sizeof(double)/sizeof(F_CFLOAT);
//if(CUDA_ARCH==20) maxthreads*=2;
//cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
@@ -64,10 +64,10 @@ void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighli
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA_opt<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -62,10 +62,10 @@ void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneig
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}

View File

@@ -20,14 +20,14 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
__device__ inline F_CFLOAT PairLJExpandCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
const F_FLOAT r = _SQRT_(rsq);
const F_FLOAT rshift = r - _shift[ij_type];
const F_FLOAT rshiftsq = rshift * rshift;
const F_FLOAT r2inv = F_F(1.0) / rshiftsq;
const F_FLOAT r6inv = r2inv * r2inv * r2inv;
const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
const F_CFLOAT r = _SQRT_(rsq);
const F_CFLOAT rshift = r - _shift[ij_type];
const F_CFLOAT rshiftsq = rshift * rshift;
const F_CFLOAT r2inv = F_F(1.0) / rshiftsq;
const F_CFLOAT r6inv = r2inv * r2inv * r2inv;
const F_CFLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]);
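
pair lj/expand is the ordinary 12/6 form evaluated at the shifted separation r - delta, where _shift holds delta per type pair. A sketch under that assumption, matching the CPU pair_lj_expand convention in which the returned F/r divides by rshift*r rather than rsq; the name lj_expand_pair is illustrative:

#include <cmath>

double lj_expand_pair(double rsq, double delta, double lj1, double lj2,
                      double lj3, double lj4, double offset,
                      double factor_lj, double &evdwl)
{
  const double r = std::sqrt(rsq);
  const double rshift = r - delta;
  const double r2inv = 1.0 / (rshift * rshift);
  const double r6inv = r2inv * r2inv * r2inv;
  const double forcelj = r6inv * (lj1 * r6inv - lj2);
  evdwl += factor_lj * (r6inv * (lj3 * r6inv - lj4) - offset);
  return factor_lj * forcelj / (rshift * r);   // F/r
}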

View File

@@ -37,10 +37,10 @@
#define _coulsw1 MY_AP(coulsw1)
#define _coulsw2 MY_AP(coulsw2)
#define _coulsw5 MY_AP(coulsw5)
__device__ __constant__ F_FLOAT _cut_coul_inner_global;
__device__ __constant__ F_FLOAT _coulsw1;
__device__ __constant__ F_FLOAT _coulsw2;
__device__ __constant__ F_FLOAT _coulsw5;
__device__ __constant__ F_CFLOAT _cut_coul_inner_global;
__device__ __constant__ F_CFLOAT _coulsw1;
__device__ __constant__ F_CFLOAT _coulsw2;
__device__ __constant__ F_CFLOAT _coulsw5;
#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
@@ -48,13 +48,13 @@ __device__ __constant__ F_FLOAT _coulsw5;
#include <time.h>
void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5)
{
Cuda_Pair_Init_AllStyles(sdata, 9, true, true, true);
cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_CFLOAT));
return;
}
@@ -62,7 +62,7 @@ void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut
void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5)
{
static short init = 0;
@@ -80,10 +80,10 @@ void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neig
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
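
The *_Init function in this file, like the others in the commit, follows one idiom: per-style scalars are staged once into __constant__ memory with cudaMemcpyToSymbol, so every thread can read them without global-memory traffic. A standalone sketch of that idiom; d_cut_inner is a hypothetical symbol, not one from this diff:

#include <cstdio>

__device__ __constant__ float d_cut_inner;

__global__ void use_constant(float *out) { *out = d_cut_inner; }

int main()
{
  float cut_inner = 1.5f;
  // One-time host-to-constant copy, as in the *_Init functions above.
  cudaMemcpyToSymbol(d_cut_inner, &cut_inner, sizeof(float));
  float *out;
  cudaMalloc(&out, sizeof(float));
  use_constant<<<1, 1>>>(out);
  float h;
  cudaMemcpy(&h, out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f\n", h);   // prints 1.500000
  cudaFree(out);
  return 0;
}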

View File

@@ -23,4 +23,4 @@
#include "cuda_shared.h"
extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5);
extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_CFLOAT cut_coul_inner, F_CFLOAT coulsw1, F_CFLOAT coulsw2, F_CFLOAT coulsw5);

View File

@@ -21,23 +21,23 @@
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
__device__ inline F_CFLOAT CoulGromacsCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_coul, int &eflag, ENERGY_CFLOAT &ecoul, F_CFLOAT qij)
{
if(qij != F_F(0.0)) {
F_FLOAT ecoul_tmp;
F_FLOAT forcecoul = _RSQRT_(rsq);
F_CFLOAT ecoul_tmp;
F_CFLOAT forcecoul = _RSQRT_(rsq);
if(eflag) ecoul_tmp = forcecoul - _coulsw5;
if(rsq > _cut_coul_inner_global * _cut_coul_inner_global) {
const F_FLOAT r = F_F(1.0) / forcecoul;
const F_FLOAT tc = r - _cut_coul_inner_global;
const F_CFLOAT r = F_F(1.0) / forcecoul;
const F_CFLOAT tc = r - _cut_coul_inner_global;
forcecoul += r * tc * tc * (_coulsw1 + _coulsw2 * tc);
if(eflag) ecoul_tmp -= tc * tc * tc * (_coulsw1 * (F_F(1.0) / F_F(3.0)) + _coulsw2 * tc * (F_F(1.0) / F_F(4.0)));
}
F_FLOAT qprod = _qqrd2e * qij * factor_coul;
F_CFLOAT qprod = _qqrd2e * qij * factor_coul;
forcecoul *= qprod;
if(eflag) {

View File

@@ -64,10 +64,10 @@ void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* snei
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
else
Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
<<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
<<< grid, threads, sharedperproc* sizeof(ENERGY_CFLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);

Some files were not shown because too many files have changed in this diff.